{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f2b656df",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import numpy as np"
]
},
{
"cell_type": "markdown",
"id": "baa3758b",
"metadata": {},
"source": [
"# Objectives"
]
},
{
"cell_type": "markdown",
"id": "bdb464ad",
"metadata": {},
"source": [
"The objectives of this analysis are to find an accurate machine learning model that is not opaque and to determine what features contribute the most significantly to covid death from the CSV file provided."
]
},
{
"cell_type": "markdown",
"id": "932008d3",
"metadata": {},
"source": [
"## Importing Data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "73e187df",
"metadata": {},
"outputs": [],
"source": [
"# Import CSV file to a dataframe and format the columns\n",
"df = pd.read_csv(\"data/COVID-19_Reported_Patient_Impact_and_Hospital_Capacity_by_State_Timeseries__RAW_.csv\")\n",
"df['date'] = pd.to_datetime(df['date'])"
]
},
{
"cell_type": "markdown",
"id": "a71b9337",
"metadata": {},
"source": [
"## Describing Data"
]
},
{
"cell_type": "markdown",
"id": "dad4fe6b",
"metadata": {},
"source": [
"The dataset is extremely wide with 133 columns, each with different levels of data completeness. All data is numeric except for state and date. \n",
"\n",
"There are 64703 entries from the years 2020 to 2023. "
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "002907f0",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" state \n",
" date \n",
" critical_staffing_shortage_today_yes \n",
" critical_staffing_shortage_today_no \n",
" critical_staffing_shortage_today_not_reported \n",
" critical_staffing_shortage_anticipated_within_week_yes \n",
" critical_staffing_shortage_anticipated_within_week_no \n",
" critical_staffing_shortage_anticipated_within_week_not_reported \n",
" hospital_onset_covid \n",
" hospital_onset_covid_coverage \n",
" ... \n",
" previous_day_admission_pediatric_covid_confirmed_5_11 \n",
" previous_day_admission_pediatric_covid_confirmed_5_11_coverage \n",
" previous_day_admission_pediatric_covid_confirmed_unknown \n",
" previous_day_admission_pediatric_covid_confirmed_unknown_coverage \n",
" staffed_icu_pediatric_patients_confirmed_covid \n",
" staffed_icu_pediatric_patients_confirmed_covid_coverage \n",
" staffed_pediatric_icu_bed_occupancy \n",
" staffed_pediatric_icu_bed_occupancy_coverage \n",
" total_staffed_pediatric_icu_beds \n",
" total_staffed_pediatric_icu_beds_coverage \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" RI \n",
" 2021-02-26 \n",
" 4 \n",
" 10 \n",
" 1 \n",
" 4 \n",
" 10 \n",
" 1 \n",
" 6.0 \n",
" 14 \n",
" ... \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" 68.0 \n",
" 14 \n",
" 80.0 \n",
" 14 \n",
" \n",
" \n",
" 1 \n",
" MA \n",
" 2021-02-24 \n",
" 10 \n",
" 90 \n",
" 1 \n",
" 9 \n",
" 91 \n",
" 1 \n",
" 40.0 \n",
" 100 \n",
" ... \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" \n",
" \n",
" 2 \n",
" NE \n",
" 2021-02-17 \n",
" 10 \n",
" 90 \n",
" 1 \n",
" 17 \n",
" 83 \n",
" 1 \n",
" 3.0 \n",
" 100 \n",
" ... \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" 0.0 \n",
" 12 \n",
" 0.0 \n",
" 12 \n",
" \n",
" \n",
" 3 \n",
" ME \n",
" 2021-01-30 \n",
" 2 \n",
" 29 \n",
" 8 \n",
" 4 \n",
" 27 \n",
" 8 \n",
" 2.0 \n",
" 38 \n",
" ... \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" 47.0 \n",
" 38 \n",
" 54.0 \n",
" 38 \n",
" \n",
" \n",
" 4 \n",
" NH \n",
" 2021-01-30 \n",
" 6 \n",
" 23 \n",
" 1 \n",
" 8 \n",
" 21 \n",
" 1 \n",
" 8.0 \n",
" 30 \n",
" ... \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" NaN \n",
" 0 \n",
" 28.0 \n",
" 17 \n",
" 39.0 \n",
" 17 \n",
" \n",
" \n",
"
\n",
"
5 rows × 135 columns
\n",
"
"
],
"text/plain": [
" state date critical_staffing_shortage_today_yes \\\n",
"0 RI 2021-02-26 4 \n",
"1 MA 2021-02-24 10 \n",
"2 NE 2021-02-17 10 \n",
"3 ME 2021-01-30 2 \n",
"4 NH 2021-01-30 6 \n",
"\n",
" critical_staffing_shortage_today_no \\\n",
"0 10 \n",
"1 90 \n",
"2 90 \n",
"3 29 \n",
"4 23 \n",
"\n",
" critical_staffing_shortage_today_not_reported \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 8 \n",
"4 1 \n",
"\n",
" critical_staffing_shortage_anticipated_within_week_yes \\\n",
"0 4 \n",
"1 9 \n",
"2 17 \n",
"3 4 \n",
"4 8 \n",
"\n",
" critical_staffing_shortage_anticipated_within_week_no \\\n",
"0 10 \n",
"1 91 \n",
"2 83 \n",
"3 27 \n",
"4 21 \n",
"\n",
" critical_staffing_shortage_anticipated_within_week_not_reported \\\n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 8 \n",
"4 1 \n",
"\n",
" hospital_onset_covid hospital_onset_covid_coverage ... \\\n",
"0 6.0 14 ... \n",
"1 40.0 100 ... \n",
"2 3.0 100 ... \n",
"3 2.0 38 ... \n",
"4 8.0 30 ... \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_5_11 \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_5_11_coverage \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_unknown \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_unknown_coverage \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" staffed_icu_pediatric_patients_confirmed_covid \\\n",
"0 NaN \n",
"1 NaN \n",
"2 NaN \n",
"3 NaN \n",
"4 NaN \n",
"\n",
" staffed_icu_pediatric_patients_confirmed_covid_coverage \\\n",
"0 0 \n",
"1 0 \n",
"2 0 \n",
"3 0 \n",
"4 0 \n",
"\n",
" staffed_pediatric_icu_bed_occupancy \\\n",
"0 68.0 \n",
"1 NaN \n",
"2 0.0 \n",
"3 47.0 \n",
"4 28.0 \n",
"\n",
" staffed_pediatric_icu_bed_occupancy_coverage \\\n",
"0 14 \n",
"1 0 \n",
"2 12 \n",
"3 38 \n",
"4 17 \n",
"\n",
" total_staffed_pediatric_icu_beds total_staffed_pediatric_icu_beds_coverage \n",
"0 80.0 14 \n",
"1 NaN 0 \n",
"2 0.0 12 \n",
"3 54.0 38 \n",
"4 39.0 17 \n",
"\n",
"[5 rows x 135 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "e3571ddf",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 64703 entries, 0 to 64702\n",
"Columns: 135 entries, state to total_staffed_pediatric_icu_beds_coverage\n",
"dtypes: datetime64[ns](1), float64(77), int64(56), object(1)\n",
"memory usage: 66.6+ MB\n"
]
}
],
"source": [
"# Print initial dataframe info\n",
"df.info()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e4781cfd",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" date \n",
" critical_staffing_shortage_today_yes \n",
" critical_staffing_shortage_today_no \n",
" critical_staffing_shortage_today_not_reported \n",
" critical_staffing_shortage_anticipated_within_week_yes \n",
" critical_staffing_shortage_anticipated_within_week_no \n",
" critical_staffing_shortage_anticipated_within_week_not_reported \n",
" hospital_onset_covid \n",
" hospital_onset_covid_coverage \n",
" inpatient_beds \n",
" ... \n",
" previous_day_admission_pediatric_covid_confirmed_5_11 \n",
" previous_day_admission_pediatric_covid_confirmed_5_11_coverage \n",
" previous_day_admission_pediatric_covid_confirmed_unknown \n",
" previous_day_admission_pediatric_covid_confirmed_unknown_coverage \n",
" staffed_icu_pediatric_patients_confirmed_covid \n",
" staffed_icu_pediatric_patients_confirmed_covid_coverage \n",
" staffed_pediatric_icu_bed_occupancy \n",
" staffed_pediatric_icu_bed_occupancy_coverage \n",
" total_staffed_pediatric_icu_beds \n",
" total_staffed_pediatric_icu_beds_coverage \n",
" \n",
" \n",
" \n",
" \n",
" count \n",
" 64703 \n",
" 64703.000000 \n",
" 64703.000000 \n",
" 64703.000000 \n",
" 64703.000000 \n",
" 64703.000000 \n",
" 64703.000000 \n",
" 64446.000000 \n",
" 64703.000000 \n",
" 64698.000000 \n",
" ... \n",
" 27677.000000 \n",
" 64703.000000 \n",
" 28407.000000 \n",
" 64703.000000 \n",
" 34554.000000 \n",
" 64703.000000 \n",
" 56400.000000 \n",
" 64703.000000 \n",
" 56387.000000 \n",
" 64703.000000 \n",
" \n",
" \n",
" mean \n",
" 2021-10-23 20:33:20.105095424 \n",
" 9.334343 \n",
" 55.179404 \n",
" 39.791323 \n",
" 14.525092 \n",
" 63.044789 \n",
" 26.735190 \n",
" 23.640924 \n",
" 98.615644 \n",
" 13338.298804 \n",
" ... \n",
" 0.639412 \n",
" 40.596958 \n",
" 0.815890 \n",
" 42.540114 \n",
" 2.922990 \n",
" 48.024466 \n",
" 169.880177 \n",
" 74.827844 \n",
" 257.038963 \n",
" 74.662025 \n",
" \n",
" \n",
" min \n",
" 2020-01-01 00:00:00 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" ... \n",
" 0.000000 \n",
" 0.000000 \n",
" -1.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" \n",
" \n",
" 25% \n",
" 2020-12-26 00:00:00 \n",
" 0.000000 \n",
" 6.000000 \n",
" 3.000000 \n",
" 2.000000 \n",
" 16.000000 \n",
" 2.000000 \n",
" 2.000000 \n",
" 38.000000 \n",
" 3158.000000 \n",
" ... \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 17.000000 \n",
" 13.000000 \n",
" 29.000000 \n",
" 13.000000 \n",
" \n",
" \n",
" 50% \n",
" 2021-10-26 00:00:00 \n",
" 3.000000 \n",
" 37.000000 \n",
" 14.000000 \n",
" 8.000000 \n",
" 52.000000 \n",
" 8.000000 \n",
" 7.000000 \n",
" 86.000000 \n",
" 8891.500000 \n",
" ... \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 0.000000 \n",
" 1.000000 \n",
" 4.000000 \n",
" 72.000000 \n",
" 52.000000 \n",
" 110.000000 \n",
" 52.000000 \n",
" \n",
" \n",
" 75% \n",
" 2022-08-22 00:00:00 \n",
" 12.000000 \n",
" 87.000000 \n",
" 47.000000 \n",
" 21.000000 \n",
" 90.000000 \n",
" 26.000000 \n",
" 23.000000 \n",
" 129.000000 \n",
" 17106.750000 \n",
" ... \n",
" 1.000000 \n",
" 58.000000 \n",
" 0.000000 \n",
" 62.000000 \n",
" 3.000000 \n",
" 84.000000 \n",
" 191.000000 \n",
" 108.000000 \n",
" 319.000000 \n",
" 108.000000 \n",
" \n",
" \n",
" max \n",
" 2023-06-17 00:00:00 \n",
" 191.000000 \n",
" 494.000000 \n",
" 523.000000 \n",
" 204.000000 \n",
" 469.000000 \n",
" 523.000000 \n",
" 1334.000000 \n",
" 597.000000 \n",
" 108966.000000 \n",
" ... \n",
" 101.000000 \n",
" 591.000000 \n",
" 155.000000 \n",
" 595.000000 \n",
" 346.000000 \n",
" 595.000000 \n",
" 2580.000000 \n",
" 597.000000 \n",
" 3917.000000 \n",
" 597.000000 \n",
" \n",
" \n",
" std \n",
" NaN \n",
" 16.287815 \n",
" 62.544193 \n",
" 66.802128 \n",
" 20.534351 \n",
" 61.075534 \n",
" 58.251265 \n",
" 53.715488 \n",
" 91.323635 \n",
" 14742.114302 \n",
" ... \n",
" 1.605329 \n",
" 73.441597 \n",
" 5.183662 \n",
" 75.923520 \n",
" 6.180913 \n",
" 77.132074 \n",
" 287.636544 \n",
" 86.462506 \n",
" 438.611268 \n",
" 86.485956 \n",
" \n",
" \n",
"
\n",
"
8 rows × 134 columns
\n",
"
"
],
"text/plain": [
" date critical_staffing_shortage_today_yes \\\n",
"count 64703 64703.000000 \n",
"mean 2021-10-23 20:33:20.105095424 9.334343 \n",
"min 2020-01-01 00:00:00 0.000000 \n",
"25% 2020-12-26 00:00:00 0.000000 \n",
"50% 2021-10-26 00:00:00 3.000000 \n",
"75% 2022-08-22 00:00:00 12.000000 \n",
"max 2023-06-17 00:00:00 191.000000 \n",
"std NaN 16.287815 \n",
"\n",
" critical_staffing_shortage_today_no \\\n",
"count 64703.000000 \n",
"mean 55.179404 \n",
"min 0.000000 \n",
"25% 6.000000 \n",
"50% 37.000000 \n",
"75% 87.000000 \n",
"max 494.000000 \n",
"std 62.544193 \n",
"\n",
" critical_staffing_shortage_today_not_reported \\\n",
"count 64703.000000 \n",
"mean 39.791323 \n",
"min 0.000000 \n",
"25% 3.000000 \n",
"50% 14.000000 \n",
"75% 47.000000 \n",
"max 523.000000 \n",
"std 66.802128 \n",
"\n",
" critical_staffing_shortage_anticipated_within_week_yes \\\n",
"count 64703.000000 \n",
"mean 14.525092 \n",
"min 0.000000 \n",
"25% 2.000000 \n",
"50% 8.000000 \n",
"75% 21.000000 \n",
"max 204.000000 \n",
"std 20.534351 \n",
"\n",
" critical_staffing_shortage_anticipated_within_week_no \\\n",
"count 64703.000000 \n",
"mean 63.044789 \n",
"min 0.000000 \n",
"25% 16.000000 \n",
"50% 52.000000 \n",
"75% 90.000000 \n",
"max 469.000000 \n",
"std 61.075534 \n",
"\n",
" critical_staffing_shortage_anticipated_within_week_not_reported \\\n",
"count 64703.000000 \n",
"mean 26.735190 \n",
"min 0.000000 \n",
"25% 2.000000 \n",
"50% 8.000000 \n",
"75% 26.000000 \n",
"max 523.000000 \n",
"std 58.251265 \n",
"\n",
" hospital_onset_covid hospital_onset_covid_coverage inpatient_beds \\\n",
"count 64446.000000 64703.000000 64698.000000 \n",
"mean 23.640924 98.615644 13338.298804 \n",
"min 0.000000 0.000000 0.000000 \n",
"25% 2.000000 38.000000 3158.000000 \n",
"50% 7.000000 86.000000 8891.500000 \n",
"75% 23.000000 129.000000 17106.750000 \n",
"max 1334.000000 597.000000 108966.000000 \n",
"std 53.715488 91.323635 14742.114302 \n",
"\n",
" ... previous_day_admission_pediatric_covid_confirmed_5_11 \\\n",
"count ... 27677.000000 \n",
"mean ... 0.639412 \n",
"min ... 0.000000 \n",
"25% ... 0.000000 \n",
"50% ... 0.000000 \n",
"75% ... 1.000000 \n",
"max ... 101.000000 \n",
"std ... 1.605329 \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_5_11_coverage \\\n",
"count 64703.000000 \n",
"mean 40.596958 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 58.000000 \n",
"max 591.000000 \n",
"std 73.441597 \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_unknown \\\n",
"count 28407.000000 \n",
"mean 0.815890 \n",
"min -1.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 0.000000 \n",
"max 155.000000 \n",
"std 5.183662 \n",
"\n",
" previous_day_admission_pediatric_covid_confirmed_unknown_coverage \\\n",
"count 64703.000000 \n",
"mean 42.540114 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 0.000000 \n",
"75% 62.000000 \n",
"max 595.000000 \n",
"std 75.923520 \n",
"\n",
" staffed_icu_pediatric_patients_confirmed_covid \\\n",
"count 34554.000000 \n",
"mean 2.922990 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 1.000000 \n",
"75% 3.000000 \n",
"max 346.000000 \n",
"std 6.180913 \n",
"\n",
" staffed_icu_pediatric_patients_confirmed_covid_coverage \\\n",
"count 64703.000000 \n",
"mean 48.024466 \n",
"min 0.000000 \n",
"25% 0.000000 \n",
"50% 4.000000 \n",
"75% 84.000000 \n",
"max 595.000000 \n",
"std 77.132074 \n",
"\n",
" staffed_pediatric_icu_bed_occupancy \\\n",
"count 56400.000000 \n",
"mean 169.880177 \n",
"min 0.000000 \n",
"25% 17.000000 \n",
"50% 72.000000 \n",
"75% 191.000000 \n",
"max 2580.000000 \n",
"std 287.636544 \n",
"\n",
" staffed_pediatric_icu_bed_occupancy_coverage \\\n",
"count 64703.000000 \n",
"mean 74.827844 \n",
"min 0.000000 \n",
"25% 13.000000 \n",
"50% 52.000000 \n",
"75% 108.000000 \n",
"max 597.000000 \n",
"std 86.462506 \n",
"\n",
" total_staffed_pediatric_icu_beds \\\n",
"count 56387.000000 \n",
"mean 257.038963 \n",
"min 0.000000 \n",
"25% 29.000000 \n",
"50% 110.000000 \n",
"75% 319.000000 \n",
"max 3917.000000 \n",
"std 438.611268 \n",
"\n",
" total_staffed_pediatric_icu_beds_coverage \n",
"count 64703.000000 \n",
"mean 74.662025 \n",
"min 0.000000 \n",
"25% 13.000000 \n",
"50% 52.000000 \n",
"75% 108.000000 \n",
"max 597.000000 \n",
"std 86.485956 \n",
"\n",
"[8 rows x 134 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print initial dataframe description\n",
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "735d71be",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" null_values \n",
" \n",
" \n",
" column \n",
" \n",
" \n",
" \n",
" \n",
" \n",
" geocoded_state \n",
" 64703 \n",
" \n",
" \n",
" previous_day_admission_pediatric_covid_confirmed_12_17 \n",
" 37040 \n",
" \n",
" \n",
" previous_day_admission_pediatric_covid_confirmed_5_11 \n",
" 37026 \n",
" \n",
" \n",
" previous_day_admission_pediatric_covid_confirmed_0_4 \n",
" 36410 \n",
" \n",
" \n",
" previous_day_admission_pediatric_covid_confirmed_unknown \n",
" 36296 \n",
" \n",
" \n",
" staffed_icu_pediatric_patients_confirmed_covid \n",
" 30149 \n",
" \n",
" \n",
" on_hand_supply_therapeutic_c_bamlanivimab_etesevimab_courses \n",
" 21210 \n",
" \n",
" \n",
" previous_week_therapeutic_c_bamlanivimab_etesevimab_courses_used \n",
" 21193 \n",
" \n",
" \n",
" on_hand_supply_therapeutic_b_bamlanivimab_courses \n",
" 17718 \n",
" \n",
" \n",
" previous_week_therapeutic_b_bamlanivimab_courses_used \n",
" 17686 \n",
" \n",
" \n",
" on_hand_supply_therapeutic_a_casirivimab_imdevimab_courses \n",
" 16528 \n",
" \n",
" \n",
" previous_week_therapeutic_a_casirivimab_imdevimab_courses_used \n",
" 16527 \n",
" \n",
" \n",
" previous_day_deaths_covid_and_influenza \n",
" 12888 \n",
" \n",
" \n",
" total_patients_hospitalized_confirmed_influenza_and_covid \n",
" 12884 \n",
" \n",
" \n",
" previous_day_deaths_influenza \n",
" 12746 \n",
" \n",
" \n",
" total_patients_hospitalized_confirmed_influenza \n",
" 11766 \n",
" \n",
" \n",
" icu_patients_confirmed_influenza \n",
" 11710 \n",
" \n",
" \n",
" previous_day_admission_influenza_confirmed \n",
" 11709 \n",
" \n",
" \n",
" total_staffed_pediatric_icu_beds \n",
" 8316 \n",
" \n",
" \n",
" all_pediatric_inpatient_beds \n",
" 8314 \n",
" \n",
" \n",
" all_pediatric_inpatient_bed_occupied \n",
" 8303 \n",
" \n",
" \n",
" staffed_pediatric_icu_bed_occupancy \n",
" 8303 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_80+ \n",
" 8158 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_50-59 \n",
" 8156 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_40-49 \n",
" 8154 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_70-79 \n",
" 8151 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_60-69 \n",
" 8150 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_20-29 \n",
" 8146 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_suspected_30-39 \n",
" 8145 \n",
" \n",
" \n",
" previous_day_admission_adult_covid_confirmed_40-49 \n",
" 8127 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" null_values\n",
"column \n",
"geocoded_state 64703\n",
"previous_day_admission_pediatric_covid_confirme... 37040\n",
"previous_day_admission_pediatric_covid_confirme... 37026\n",
"previous_day_admission_pediatric_covid_confirme... 36410\n",
"previous_day_admission_pediatric_covid_confirme... 36296\n",
"staffed_icu_pediatric_patients_confirmed_covid 30149\n",
"on_hand_supply_therapeutic_c_bamlanivimab_etese... 21210\n",
"previous_week_therapeutic_c_bamlanivimab_etesev... 21193\n",
"on_hand_supply_therapeutic_b_bamlanivimab_courses 17718\n",
"previous_week_therapeutic_b_bamlanivimab_course... 17686\n",
"on_hand_supply_therapeutic_a_casirivimab_imdevi... 16528\n",
"previous_week_therapeutic_a_casirivimab_imdevim... 16527\n",
"previous_day_deaths_covid_and_influenza 12888\n",
"total_patients_hospitalized_confirmed_influenza... 12884\n",
"previous_day_deaths_influenza 12746\n",
"total_patients_hospitalized_confirmed_influenza 11766\n",
"icu_patients_confirmed_influenza 11710\n",
"previous_day_admission_influenza_confirmed 11709\n",
"total_staffed_pediatric_icu_beds 8316\n",
"all_pediatric_inpatient_beds 8314\n",
"all_pediatric_inpatient_bed_occupied 8303\n",
"staffed_pediatric_icu_bed_occupancy 8303\n",
"previous_day_admission_adult_covid_suspected_80+ 8158\n",
"previous_day_admission_adult_covid_suspected_50-59 8156\n",
"previous_day_admission_adult_covid_suspected_40-49 8154\n",
"previous_day_admission_adult_covid_suspected_70-79 8151\n",
"previous_day_admission_adult_covid_suspected_60-69 8150\n",
"previous_day_admission_adult_covid_suspected_20-29 8146\n",
"previous_day_admission_adult_covid_suspected_30-39 8145\n",
"previous_day_admission_adult_covid_confirmed_40-49 8127"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Create null_df showing null value counts\n",
"null_df = (df\n",
" .isnull()\n",
" .sum()\n",
" .to_frame()\n",
" .reset_index()\n",
" .rename(columns={'index':'column', 0:'null_values'})\n",
" .sort_values(by='null_values', ascending=False)\n",
" .reset_index(drop=True)\n",
" .set_index('column')\n",
" )\n",
"\n",
"# Filter null_df to only columns that have null values\n",
"null_df = null_df[null_df['null_values'] != 0]\n",
"\n",
"# Print top 30 null value counts\n",
"null_df.head(30)"
]
},
{
"cell_type": "markdown",
"id": "6f984a76",
"metadata": {},
"source": [
"# Visualizing Data"
]
},
{
"cell_type": "markdown",
"id": "32c5e231",
"metadata": {},
"source": [
"The data clearly has holes in it, as demonstrated in the two following line charts. This is going to be fixed later through fixing NaN values in the dataframe."
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "c2a3a63f",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Plots with similar trends showing missing values\n",
"df[['date', 'percent_of_inpatients_with_covid', 'inpatient_bed_covid_utilization', 'adult_icu_bed_covid_utilization']].groupby('date').mean().resample('W').mean().plot()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f698ca81",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Adult ICU Bed Utilization and Inpatient Beds Utilization line charts\n",
"df[['date', 'adult_icu_bed_utilization', 'inpatient_beds_utilization']].groupby('date').mean().resample('W').mean().plot()"
]
},
{
"cell_type": "markdown",
"id": "152b4541",
"metadata": {},
"source": [
"# Preprocessing Data"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "e6622e75",
"metadata": {},
"outputs": [],
"source": [
"# Drop geocoded_state as it is empty\n",
"df.drop(columns='geocoded_state', inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "eff4b2ad",
"metadata": {},
"outputs": [],
"source": [
"# Create a mask and remove the beginning days of pandemic with little information\n",
"start_date = '2020-01-01'\n",
"end_date = '2020-08-01'\n",
"mask = (df['date'] >= start_date) & (df['date'] <= end_date)\n",
"\n",
"# Apply mask to dataframe to filter by date\n",
"df = df.loc[~mask].reset_index(drop=True)"
]
},
{
"cell_type": "markdown",
"id": "5730c055",
"metadata": {},
"source": [
"## Fix NaN Values"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "868f4490",
"metadata": {},
"outputs": [],
"source": [
"# Forward fill all null values and remove the rest\n",
"df = df.fillna(method='ffill').dropna().reset_index(drop=True)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "f9db538d",
"metadata": {},
"outputs": [],
"source": [
"# Create null_df showing null value counts\n",
"null_df = (df\n",
" .isnull()\n",
" .sum()\n",
" .to_frame()\n",
" .reset_index()\n",
" .rename(columns={'index':'column', 0:'null_values'})\n",
" .sort_values(by='null_values', ascending=False)\n",
" .reset_index(drop=True)\n",
" .set_index('column')\n",
" )\n",
"\n",
"# Filter null_df to only columns that have null values\n",
"null_df = null_df[null_df['null_values'] != 0]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "df9fdde7",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" null_values \n",
" \n",
" \n",
" column \n",
" \n",
" \n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [null_values]\n",
"Index: []"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print null values\n",
"# There are no more null values\n",
"null_df"
]
},
{
"cell_type": "markdown",
"id": "237df7d5",
"metadata": {},
"source": [
"# Model Training"
]
},
{
"cell_type": "markdown",
"id": "c24179d4",
"metadata": {},
"source": [
"I chose two different models here including:\n",
"* Random Forest\n",
"* Decision Tree\n",
"* Linear Regression\n",
"\n",
"Each of these models are first analyzed using a .20 test and .80 train split.\n",
"\n",
"The results are shown under the Accuracy Results section. Each model showed improvement, especially decision trees which shot up by 8% accuracy."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "d8100292",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import StandardScaler\n",
"from sklearn.metrics import mean_squared_error"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "d80875c2",
"metadata": {},
"outputs": [],
"source": [
"# Remove unnecessary columns and the result variable\n",
"X = df.drop(columns={'state', 'date', 'deaths_covid'})\n",
"\n",
"# Extract result variable\n",
"y = df['deaths_covid']\n",
"\n",
"# Set random_state constant\n",
"random_state = 42"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "ffa1b405",
"metadata": {},
"outputs": [],
"source": [
"# Split data into train/test splits\n",
"X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = random_state)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "be61a67e",
"metadata": {},
"outputs": [],
"source": [
"# Scale data using StandardScaler()\n",
"sc = StandardScaler()\n",
"X_train = sc.fit_transform(X_train)\n",
"X_test = sc.transform(X_test)"
]
},
{
"cell_type": "markdown",
"id": "0818ebc6",
"metadata": {},
"source": [
"## Random Forest"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "d909773e",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.ensemble import RandomForestClassifier"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "20cf3543",
"metadata": {},
"outputs": [],
"source": [
"# Define classifier\n",
"rfc = RandomForestClassifier(random_state=random_state)\n",
"\n",
"# Run predictions using random forest classifier\n",
"rfc.fit(X_train, y_train)\n",
"pred_rfc = rfc.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "2dd749f9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Random Forest Root Mean Squared Error: 10.288897409494494\n"
]
}
],
"source": [
"# Calculate mean squared error\n",
"mse_rf = mean_squared_error(y_test, pred_rfc)\n",
"\n",
"# Calculate root mean squared error\n",
"rmse_rf = np.sqrt(mse_rf) \n",
"\n",
"# Print RMSE\n",
"print(\"Random Forest Root Mean Squared Error:\", rmse_rf)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "fc764cdc",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Feature \n",
" Importance \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" staffed_icu_adult_patients_confirmed_covid \n",
" 0.026500 \n",
" \n",
" \n",
" 1 \n",
" staffed_icu_adult_patients_confirmed_and_suspe... \n",
" 0.025898 \n",
" \n",
" \n",
" 2 \n",
" adult_icu_bed_covid_utilization_numerator \n",
" 0.023353 \n",
" \n",
" \n",
" 3 \n",
" percent_of_inpatients_with_covid_numerator \n",
" 0.020792 \n",
" \n",
" \n",
" 4 \n",
" total_adult_patients_hospitalized_confirmed_covid \n",
" 0.019461 \n",
" \n",
" \n",
" 5 \n",
" total_adult_patients_hospitalized_confirmed_an... \n",
" 0.019099 \n",
" \n",
" \n",
" 6 \n",
" deaths_covid_coverage \n",
" 0.018980 \n",
" \n",
" \n",
" 7 \n",
" inpatient_bed_covid_utilization_numerator \n",
" 0.018109 \n",
" \n",
" \n",
" 8 \n",
" previous_day_admission_adult_covid_confirmed_5... \n",
" 0.016019 \n",
" \n",
" \n",
" 9 \n",
" inpatient_beds_used_covid \n",
" 0.015906 \n",
" \n",
" \n",
" 10 \n",
" adult_icu_bed_covid_utilization \n",
" 0.015842 \n",
" \n",
" \n",
" 11 \n",
" previous_day_admission_adult_covid_confirmed \n",
" 0.014609 \n",
" \n",
" \n",
" 12 \n",
" inpatient_beds_utilization \n",
" 0.013619 \n",
" \n",
" \n",
" 13 \n",
" inpatient_bed_covid_utilization \n",
" 0.013484 \n",
" \n",
" \n",
" 14 \n",
" adult_icu_bed_utilization \n",
" 0.013188 \n",
" \n",
" \n",
" 15 \n",
" percent_of_inpatients_with_covid \n",
" 0.013176 \n",
" \n",
" \n",
" 16 \n",
" previous_day_admission_adult_covid_confirmed_4... \n",
" 0.012333 \n",
" \n",
" \n",
" 17 \n",
" adult_icu_bed_covid_utilization_denominator \n",
" 0.012239 \n",
" \n",
" \n",
" 18 \n",
" critical_staffing_shortage_today_no \n",
" 0.012147 \n",
" \n",
" \n",
" 19 \n",
" previous_day_admission_adult_covid_suspected \n",
" 0.012105 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Feature Importance\n",
"0 staffed_icu_adult_patients_confirmed_covid 0.026500\n",
"1 staffed_icu_adult_patients_confirmed_and_suspe... 0.025898\n",
"2 adult_icu_bed_covid_utilization_numerator 0.023353\n",
"3 percent_of_inpatients_with_covid_numerator 0.020792\n",
"4 total_adult_patients_hospitalized_confirmed_covid 0.019461\n",
"5 total_adult_patients_hospitalized_confirmed_an... 0.019099\n",
"6 deaths_covid_coverage 0.018980\n",
"7 inpatient_bed_covid_utilization_numerator 0.018109\n",
"8 previous_day_admission_adult_covid_confirmed_5... 0.016019\n",
"9 inpatient_beds_used_covid 0.015906\n",
"10 adult_icu_bed_covid_utilization 0.015842\n",
"11 previous_day_admission_adult_covid_confirmed 0.014609\n",
"12 inpatient_beds_utilization 0.013619\n",
"13 inpatient_bed_covid_utilization 0.013484\n",
"14 adult_icu_bed_utilization 0.013188\n",
"15 percent_of_inpatients_with_covid 0.013176\n",
"16 previous_day_admission_adult_covid_confirmed_4... 0.012333\n",
"17 adult_icu_bed_covid_utilization_denominator 0.012239\n",
"18 critical_staffing_shortage_today_no 0.012147\n",
"19 previous_day_admission_adult_covid_suspected 0.012105"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Add random forest importances to dataframe\n",
"Random_Forest_Importances = pd.DataFrame({\n",
" \"Feature\": X.columns, \n",
" \"Importance\": rfc.feature_importances_\n",
"}).sort_values(\"Importance\", ascending=False).reset_index(drop=True)\n",
"\n",
"# Print top 20 values of the dataframe\n",
"Random_Forest_Importances.head(20)"
]
},
{
"cell_type": "markdown",
"id": "c3fa8ae5",
"metadata": {},
"source": [
"# Decision Trees"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "e00fb574",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.tree import DecisionTreeRegressor"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "8763eacb",
"metadata": {},
"outputs": [],
"source": [
"# Define classification\n",
"dt = DecisionTreeRegressor(random_state=random_state)\n",
"\n",
"# Run prediction using decision trees\n",
"dt.fit(X_train, y_train)\n",
"pred_dt = dt.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"id": "f870375a",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Random Forest Root Mean Squared Error: 11.982188903538527\n"
]
}
],
"source": [
"# Calculate mean squared error\n",
"mse_dt = mean_squared_error(y_test, pred_dt)\n",
"\n",
"# Calculate root mean squared error\n",
"rmse_dt = np.sqrt(mse_dt) \n",
"\n",
"# Print RMSE\n",
"print(\"Random Forest Root Mean Squared Error:\", rmse_dt)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"id": "08205ed2",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Feature \n",
" Importance \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" staffed_icu_adult_patients_confirmed_covid \n",
" 0.619773 \n",
" \n",
" \n",
" 1 \n",
" inpatient_beds_coverage \n",
" 0.067250 \n",
" \n",
" \n",
" 2 \n",
" adult_icu_bed_covid_utilization_numerator \n",
" 0.046706 \n",
" \n",
" \n",
" 3 \n",
" total_adult_patients_hospitalized_confirmed_an... \n",
" 0.036031 \n",
" \n",
" \n",
" 4 \n",
" previous_day_admission_adult_covid_confirmed_u... \n",
" 0.029505 \n",
" \n",
" \n",
" 5 \n",
" previous_day_admission_adult_covid_suspected_6... \n",
" 0.024762 \n",
" \n",
" \n",
" 6 \n",
" previous_day_admission_adult_covid_suspected_7... \n",
" 0.023170 \n",
" \n",
" \n",
" 7 \n",
" deaths_covid_coverage \n",
" 0.012864 \n",
" \n",
" \n",
" 8 \n",
" staffed_pediatric_icu_bed_occupancy \n",
" 0.010547 \n",
" \n",
" \n",
" 9 \n",
" total_adult_patients_hospitalized_confirmed_covid \n",
" 0.007443 \n",
" \n",
" \n",
" 10 \n",
" previous_day_admission_adult_covid_suspected_5... \n",
" 0.007377 \n",
" \n",
" \n",
" 11 \n",
" critical_staffing_shortage_today_not_reported \n",
" 0.006928 \n",
" \n",
" \n",
" 12 \n",
" adult_icu_bed_covid_utilization \n",
" 0.006112 \n",
" \n",
" \n",
" 13 \n",
" critical_staffing_shortage_today_yes \n",
" 0.005054 \n",
" \n",
" \n",
" 14 \n",
" adult_icu_bed_utilization_denominator \n",
" 0.004770 \n",
" \n",
" \n",
" 15 \n",
" staffed_adult_icu_bed_occupancy \n",
" 0.004643 \n",
" \n",
" \n",
" 16 \n",
" staffed_icu_pediatric_patients_confirmed_covid... \n",
" 0.004381 \n",
" \n",
" \n",
" 17 \n",
" total_staffed_pediatric_icu_beds \n",
" 0.004099 \n",
" \n",
" \n",
" 18 \n",
" staffed_icu_adult_patients_confirmed_and_suspe... \n",
" 0.003889 \n",
" \n",
" \n",
" 19 \n",
" inpatient_beds_used_covid \n",
" 0.003455 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Feature Importance\n",
"0 staffed_icu_adult_patients_confirmed_covid 0.619773\n",
"1 inpatient_beds_coverage 0.067250\n",
"2 adult_icu_bed_covid_utilization_numerator 0.046706\n",
"3 total_adult_patients_hospitalized_confirmed_an... 0.036031\n",
"4 previous_day_admission_adult_covid_confirmed_u... 0.029505\n",
"5 previous_day_admission_adult_covid_suspected_6... 0.024762\n",
"6 previous_day_admission_adult_covid_suspected_7... 0.023170\n",
"7 deaths_covid_coverage 0.012864\n",
"8 staffed_pediatric_icu_bed_occupancy 0.010547\n",
"9 total_adult_patients_hospitalized_confirmed_covid 0.007443\n",
"10 previous_day_admission_adult_covid_suspected_5... 0.007377\n",
"11 critical_staffing_shortage_today_not_reported 0.006928\n",
"12 adult_icu_bed_covid_utilization 0.006112\n",
"13 critical_staffing_shortage_today_yes 0.005054\n",
"14 adult_icu_bed_utilization_denominator 0.004770\n",
"15 staffed_adult_icu_bed_occupancy 0.004643\n",
"16 staffed_icu_pediatric_patients_confirmed_covid... 0.004381\n",
"17 total_staffed_pediatric_icu_beds 0.004099\n",
"18 staffed_icu_adult_patients_confirmed_and_suspe... 0.003889\n",
"19 inpatient_beds_used_covid 0.003455"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank every feature by the fitted decision tree's importance score, highest first\n",
"Decision_Tree_Importances = (\n",
"    pd.DataFrame({\"Feature\": X.columns, \"Importance\": dt.feature_importances_})\n",
"    .sort_values(\"Importance\", ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Show the 20 highest-ranked features\n",
"Decision_Tree_Importances.head(20)"
]
},
{
"cell_type": "markdown",
"id": "30bb6cf8",
"metadata": {},
"source": [
"## Linear Regression"
]
},
{
"cell_type": "code",
"execution_count": 26,
"id": "bb38e278",
"metadata": {},
"outputs": [],
"source": [
"from sklearn.linear_model import LinearRegression"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "cae203a5",
"metadata": {},
"outputs": [],
"source": [
"# Create the linear regression model and fit it on X_train / y_train\n",
"lr = LinearRegression().fit(X_train, y_train)\n",
"\n",
"# Predict the target for X_test with the fitted model\n",
"pred_lr = lr.predict(X_test)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "ac3c270f",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Linear Regression Root Mean Squared Error: 11.579237777122788\n"
]
}
],
"source": [
"# Mean squared error of the linear regression predictions against y_test\n",
"mse_lr = mean_squared_error(y_true=y_test, y_pred=pred_lr)\n",
"\n",
"# Square root converts the MSE into a root mean squared error\n",
"rmse_lr = np.sqrt(mse_lr)\n",
"\n",
"# Report the RMSE for this model\n",
"print(\"Linear Regression Root Mean Squared Error:\", rmse_lr)"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "416424f9",
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Feature \n",
" Coefficient \n",
" Abs_Coefficient \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" critical_staffing_shortage_today_no \n",
" -1.407022e+12 \n",
" 1.407022e+12 \n",
" \n",
" \n",
" 1 \n",
" critical_staffing_shortage_today_not_reported \n",
" -1.399735e+12 \n",
" 1.399735e+12 \n",
" \n",
" \n",
" 2 \n",
" critical_staffing_shortage_anticipated_within_... \n",
" 1.342986e+12 \n",
" 1.342986e+12 \n",
" \n",
" \n",
" 3 \n",
" critical_staffing_shortage_anticipated_within_... \n",
" 1.129891e+12 \n",
" 1.129891e+12 \n",
" \n",
" \n",
" 4 \n",
" critical_staffing_shortage_anticipated_within_... \n",
" 4.675849e+11 \n",
" 4.675849e+11 \n",
" \n",
" \n",
" 5 \n",
" critical_staffing_shortage_today_yes \n",
" -3.752257e+11 \n",
" 3.752257e+11 \n",
" \n",
" \n",
" 6 \n",
" inpatient_beds_coverage \n",
" 1.577878e+02 \n",
" 1.577878e+02 \n",
" \n",
" \n",
" 7 \n",
" inpatient_bed_covid_utilization_coverage \n",
" -1.557128e+02 \n",
" 1.557128e+02 \n",
" \n",
" \n",
" 8 \n",
" adult_icu_bed_utilization_numerator \n",
" 1.069342e+02 \n",
" 1.069342e+02 \n",
" \n",
" \n",
" 9 \n",
" staffed_adult_icu_bed_occupancy \n",
" -1.059861e+02 \n",
" 1.059861e+02 \n",
" \n",
" \n",
" 10 \n",
" percent_of_inpatients_with_covid_coverage \n",
" 1.012391e+02 \n",
" 1.012391e+02 \n",
" \n",
" \n",
" 11 \n",
" previous_day_admission_adult_covid_confirmed_5... \n",
" -1.000766e+02 \n",
" 1.000766e+02 \n",
" \n",
" \n",
" 12 \n",
" all_pediatric_inpatient_beds_coverage \n",
" -8.345058e+01 \n",
" 8.345058e+01 \n",
" \n",
" \n",
" 13 \n",
" inpatient_beds_utilization_coverage \n",
" -8.208582e+01 \n",
" 8.208582e+01 \n",
" \n",
" \n",
" 14 \n",
" icu_patients_confirmed_influenza_coverage \n",
" -7.412975e+01 \n",
" 7.412975e+01 \n",
" \n",
" \n",
" 15 \n",
" staffed_icu_adult_patients_confirmed_and_suspe... \n",
" -7.337561e+01 \n",
" 7.337561e+01 \n",
" \n",
" \n",
" 16 \n",
" staffed_icu_adult_patients_confirmed_covid_cov... \n",
" 7.295163e+01 \n",
" 7.295163e+01 \n",
" \n",
" \n",
" 17 \n",
" previous_day_admission_adult_covid_suspected_7... \n",
" -7.136121e+01 \n",
" 7.136121e+01 \n",
" \n",
" \n",
" 18 \n",
" staffed_pediatric_icu_bed_occupancy_coverage \n",
" 7.107138e+01 \n",
" 7.107138e+01 \n",
" \n",
" \n",
" 19 \n",
" adult_icu_bed_covid_utilization_numerator \n",
" -7.093283e+01 \n",
" 7.093283e+01 \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Feature Coefficient \\\n",
"0 critical_staffing_shortage_today_no -1.407022e+12 \n",
"1 critical_staffing_shortage_today_not_reported -1.399735e+12 \n",
"2 critical_staffing_shortage_anticipated_within_... 1.342986e+12 \n",
"3 critical_staffing_shortage_anticipated_within_... 1.129891e+12 \n",
"4 critical_staffing_shortage_anticipated_within_... 4.675849e+11 \n",
"5 critical_staffing_shortage_today_yes -3.752257e+11 \n",
"6 inpatient_beds_coverage 1.577878e+02 \n",
"7 inpatient_bed_covid_utilization_coverage -1.557128e+02 \n",
"8 adult_icu_bed_utilization_numerator 1.069342e+02 \n",
"9 staffed_adult_icu_bed_occupancy -1.059861e+02 \n",
"10 percent_of_inpatients_with_covid_coverage 1.012391e+02 \n",
"11 previous_day_admission_adult_covid_confirmed_5... -1.000766e+02 \n",
"12 all_pediatric_inpatient_beds_coverage -8.345058e+01 \n",
"13 inpatient_beds_utilization_coverage -8.208582e+01 \n",
"14 icu_patients_confirmed_influenza_coverage -7.412975e+01 \n",
"15 staffed_icu_adult_patients_confirmed_and_suspe... -7.337561e+01 \n",
"16 staffed_icu_adult_patients_confirmed_covid_cov... 7.295163e+01 \n",
"17 previous_day_admission_adult_covid_suspected_7... -7.136121e+01 \n",
"18 staffed_pediatric_icu_bed_occupancy_coverage 7.107138e+01 \n",
"19 adult_icu_bed_covid_utilization_numerator -7.093283e+01 \n",
"\n",
" Abs_Coefficient \n",
"0 1.407022e+12 \n",
"1 1.399735e+12 \n",
"2 1.342986e+12 \n",
"3 1.129891e+12 \n",
"4 4.675849e+11 \n",
"5 3.752257e+11 \n",
"6 1.577878e+02 \n",
"7 1.557128e+02 \n",
"8 1.069342e+02 \n",
"9 1.059861e+02 \n",
"10 1.012391e+02 \n",
"11 1.000766e+02 \n",
"12 8.345058e+01 \n",
"13 8.208582e+01 \n",
"14 7.412975e+01 \n",
"15 7.337561e+01 \n",
"16 7.295163e+01 \n",
"17 7.136121e+01 \n",
"18 7.107138e+01 \n",
"19 7.093283e+01 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Pair each feature with its fitted linear regression coefficient and rank by\n",
"# coefficient magnitude: the sign only gives the direction of the association,\n",
"# so the absolute value is what gets compared across features.\n",
"# A single sort on Abs_Coefficient is enough, and the vectorised .abs() replaces\n",
"# a per-element Python lambda.\n",
"Linear_Regression_Coefficients = (\n",
"    pd.DataFrame({'Feature': X.columns, 'Coefficient': lr.coef_})\n",
"    .assign(Abs_Coefficient=lambda coefs: coefs['Coefficient'].abs())\n",
"    .sort_values(by='Abs_Coefficient', ascending=False)\n",
"    .reset_index(drop=True)\n",
")\n",
"\n",
"# Show the 20 largest-magnitude coefficients; the full ranking stays in the frame,\n",
"# matching how the random forest and decision tree importance frames are kept\n",
"Linear_Regression_Coefficients.head(20)"
]
},
{
"cell_type": "markdown",
"id": "bcd0249e",
"metadata": {},
"source": [
"# Results"
]
},
{
"cell_type": "markdown",
"id": "ba850582",
"metadata": {},
"source": [
"## Key Findings and Optimal Model\n",
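"The results show the reliability of the models as follows (lower RMSE is better): \n",
"* The Decision Tree model is the most reliable, with a root mean squared error (RMSE) of 11.2 \n",
"* The Random Forest model is next with an RMSE of 12.3 \n",
"* The Linear Regression model is last with an RMSE of 15.7. \n",
"\n",
"**Note:** these figures appear to come from an earlier run and do not match the outputs saved in this notebook, which report an RMSE of about 11.98 for the Decision Tree and 11.58 for Linear Regression. Re-check the ranking against the printed values and the Model Accuracy chart below.\n",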
"\n",
"The most common features among the algorithms are the following:\n",
"* staffed_icu_adult_patients_confirmed_covid\n",
"* adult_icu_bed_covid_utilization_numerator\n",
"* total_adult_patients_hospitalized_confirmed_covid\n",
"* total_adult_patients_hospitalized_confirmed_and_suspected_covid\n",
"* deaths_covid_coverage\n",
"* inpatient_beds_coverage\n",
"\n",
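"One potential issue is that the features extracted from Linear Regression are assumed to be more important the larger the absolute value of their coefficient. This is not necessarily the case, because the size of a coefficient also depends on the scale of its feature, and it can skew the results. For example, the critical_staffing_shortage columns receive coefficients on the order of 1e12, far larger than those of every other feature, which likely reflects unstable estimates rather than real importance. More research may be needed."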
]
},
{
"cell_type": "markdown",
"id": "0fa7b66e",
"metadata": {},
"source": [
"## Model Accuracy"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "1260e6c5",
"metadata": {},
"outputs": [],
"source": [
"# Collect each model's RMSE in one table indexed by model name, lowest RMSE first.\n",
"# Building the frame from a dict keeps RMSE as a float column; transposing a\n",
"# list of [names, scores] would leave it with object dtype.\n",
"result_models = ['Random Forest', 'Decision Tree', 'Linear Regression']\n",
"result_stats = [rmse_rf, rmse_dt, rmse_lr]\n",
"results = (\n",
"    pd.DataFrame({'Models': result_models, 'RMSE': result_stats})\n",
"    .set_index('Models')\n",
"    .sort_values(by='RMSE')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "428ff430",
"metadata": {},
"outputs": [
{
"data": {
"image/png": "",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of RMSE per model; the legend is switched off\n",
"ax = results.plot.bar(legend=False)\n",
"\n",
"# Title and axis labels so the figure can be read on its own\n",
"ax.set(\n",
"    title='Root Mean Squared Error for Models',\n",
"    xlabel='Models',\n",
"    ylabel='Root Mean Squared Error',\n",
")\n",
"\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"id": "58c5805d",
"metadata": {},
"source": [
"## Top Features"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "84f75fbe",
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "4dbe690e",
"metadata": {},
"outputs": [],
"source": [
"# One single-column frame per model holding the names of its 10 top-ranked features\n",
"RF_top10 = Random_Forest_Importances[['Feature']].head(10).rename(columns={'Feature': 'Random Forest'})\n",
"DT_top10 = Decision_Tree_Importances[['Feature']].head(10).rename(columns={'Feature': 'Decision Tree'})\n",
"LR_top10 = Linear_Regression_Coefficients[['Feature']].head(10).rename(columns={'Feature': 'Linear Regression'})\n",
"\n",
"# Line the three rankings up side by side by joining on their row index\n",
"top10 = RF_top10.join(DT_top10, how='outer').join(LR_top10, how='outer')"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "2bef7ad6",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" \n",
" Random Forest \n",
" Decision Tree \n",
" Linear Regression \n",
" \n",
" \n",
" \n",
" \n",
" 0 \n",
" staffed_icu_adult_patients_confirmed_covid \n",
" staffed_icu_adult_patients_confirmed_covid \n",
" critical_staffing_shortage_today_no \n",
" \n",
" \n",
" 1 \n",
" staffed_icu_adult_patients_confirmed_and_suspe... \n",
" inpatient_beds_coverage \n",
" critical_staffing_shortage_today_not_reported \n",
" \n",
" \n",
" 2 \n",
" adult_icu_bed_covid_utilization_numerator \n",
" adult_icu_bed_covid_utilization_numerator \n",
" critical_staffing_shortage_anticipated_within_... \n",
" \n",
" \n",
" 3 \n",
" percent_of_inpatients_with_covid_numerator \n",
" total_adult_patients_hospitalized_confirmed_an... \n",
" critical_staffing_shortage_anticipated_within_... \n",
" \n",
" \n",
" 4 \n",
" total_adult_patients_hospitalized_confirmed_covid \n",
" previous_day_admission_adult_covid_confirmed_u... \n",
" critical_staffing_shortage_anticipated_within_... \n",
" \n",
" \n",
" 5 \n",
" total_adult_patients_hospitalized_confirmed_an... \n",
" previous_day_admission_adult_covid_suspected_6... \n",
" critical_staffing_shortage_today_yes \n",
" \n",
" \n",
" 6 \n",
" deaths_covid_coverage \n",
" previous_day_admission_adult_covid_suspected_7... \n",
" inpatient_beds_coverage \n",
" \n",
" \n",
" 7 \n",
" inpatient_bed_covid_utilization_numerator \n",
" deaths_covid_coverage \n",
" inpatient_bed_covid_utilization_coverage \n",
" \n",
" \n",
" 8 \n",
" previous_day_admission_adult_covid_confirmed_5... \n",
" staffed_pediatric_icu_bed_occupancy \n",
" adult_icu_bed_utilization_numerator \n",
" \n",
" \n",
" 9 \n",
" inpatient_beds_used_covid \n",
" total_adult_patients_hospitalized_confirmed_covid \n",
" staffed_adult_icu_bed_occupancy \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Random Forest \\\n",
"0 staffed_icu_adult_patients_confirmed_covid \n",
"1 staffed_icu_adult_patients_confirmed_and_suspe... \n",
"2 adult_icu_bed_covid_utilization_numerator \n",
"3 percent_of_inpatients_with_covid_numerator \n",
"4 total_adult_patients_hospitalized_confirmed_covid \n",
"5 total_adult_patients_hospitalized_confirmed_an... \n",
"6 deaths_covid_coverage \n",
"7 inpatient_bed_covid_utilization_numerator \n",
"8 previous_day_admission_adult_covid_confirmed_5... \n",
"9 inpatient_beds_used_covid \n",
"\n",
" Decision Tree \\\n",
"0 staffed_icu_adult_patients_confirmed_covid \n",
"1 inpatient_beds_coverage \n",
"2 adult_icu_bed_covid_utilization_numerator \n",
"3 total_adult_patients_hospitalized_confirmed_an... \n",
"4 previous_day_admission_adult_covid_confirmed_u... \n",
"5 previous_day_admission_adult_covid_suspected_6... \n",
"6 previous_day_admission_adult_covid_suspected_7... \n",
"7 deaths_covid_coverage \n",
"8 staffed_pediatric_icu_bed_occupancy \n",
"9 total_adult_patients_hospitalized_confirmed_covid \n",
"\n",
" Linear Regression \n",
"0 critical_staffing_shortage_today_no \n",
"1 critical_staffing_shortage_today_not_reported \n",
"2 critical_staffing_shortage_anticipated_within_... \n",
"3 critical_staffing_shortage_anticipated_within_... \n",
"4 critical_staffing_shortage_anticipated_within_... \n",
"5 critical_staffing_shortage_today_yes \n",
"6 inpatient_beds_coverage \n",
"7 inpatient_bed_covid_utilization_coverage \n",
"8 adult_icu_bed_utilization_numerator \n",
"9 staffed_adult_icu_bed_occupancy "
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Print significant features on models \n",
"top10"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "b9a796f1",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"staffed_icu_adult_patients_confirmed_covid\n",
"adult_icu_bed_covid_utilization_numerator\n",
"total_adult_patients_hospitalized_confirmed_covid\n",
"total_adult_patients_hospitalized_confirmed_and_suspected_covid\n",
"deaths_covid_coverage\n",
"inpatient_beds_coverage\n"
]
}
],
"source": [
"# Stack the three top-10 columns into one long Series of feature names\n",
"model_columns = ['Random Forest', 'Decision Tree', 'Linear Regression']\n",
"all_entries = pd.concat([top10[column] for column in model_columns])\n",
"\n",
"# Tally how many times each feature name appears across the three lists\n",
"counter = Counter(all_entries)\n",
"\n",
"# The largest tally is the count a feature must reach to be reported\n",
"highest_count = counter.most_common(1)[0][1]\n",
"\n",
"# Print every feature that reaches that tally, in the order it was first seen\n",
"for entry, count in counter.items():\n",
"    if count == highest_count:\n",
"        print(entry)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4e2235e",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 5
}