Putranegara Riauwindu, Boston University, Boston, United States, putrangr@bu.edu
Abstract:
The COVID-19 pandemic has presented diverse challenges to nations worldwide, necessitating a comprehensive understanding of countries' varied dynamics and responses to the crisis. This paper conducts an analysis of multiple features to explore the dynamics observed during the pandemic, including epidemiological statistics, demographic indicators, economic metrics, geographical information, health indicators, mobility patterns, weather, and government responses. Using data sourced from the Google Health COVID-19 Open Data Repository, this study employs clustering techniques to categorize countries into three distinct clusters. By discerning patterns across different clusters, this study aims to aid in formulating effective strategies and policies, with potential implications for bolstering global preparedness in the event of future health emergencies.
import gc
import re
import warnings

import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn import metrics
from tabulate import tabulate
from wordcloud import WordCloud
import dash
from dash import dcc, html, Input, Output
from ipywidgets import interact, widgets
from IPython.display import display, clear_output

warnings.filterwarnings('ignore')
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
def enable_plotly_in_cell():
    # Enable Plotly rendering inside notebook output cells
    import IPython
    from plotly.offline import init_notebook_mode
    display(IPython.core.display.HTML('''<script src="/static/components/requirejs/require.js"></script>'''))
    init_notebook_mode(connected=False)
The data was aggregated and collected from https://health.google.com/covid-19/open-data/. The Google Health COVID-19 Open Data Repository is one of the most comprehensive collections of up-to-date COVID-19-related information. Comprising data from more than 20,000 locations worldwide, it contains a rich variety of data types to help public health professionals, researchers, policymakers and others in understanding and managing the virus.
Because the source files are large, the datasets are imported and merged one at a time, with each source frame released after its merge to keep memory usage manageable.
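A minimal sketch of this load-merge-free pattern (illustrative only; the cells below inline these steps, and the helper name is hypothetical):

# Illustrative helper, assuming every CSV shares the 'location_key' join column
def merge_and_free(base, path, key='location_key'):
    part = pd.read_csv(path)           # load one dataset
    base = base.merge(part, on=key)    # fold it into the running frame
    del part                           # release the source frame...
    gc.collect()                       # ...and reclaim its memory before the next load
    return base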
Theme of the Paper: Country Performance during Two Years of the Pandemic
Looking back at how countries performed during the pandemic, this analysis groups them into several clusters to draw insights and lessons learned that might apply to a future pandemic.
# Importing the location index and keeping country-level (two-character) keys
index = pd.read_csv('index.csv')
index = index[['country_code','country_name']]
index.rename(columns={'country_code': 'location_key'}, inplace=True)
index = index[index['location_key'].str.len() == 2]
index = index.drop_duplicates()
# Importing epidemiology dataset
epidemiology = pd.read_csv("epidemiology.csv")
Checking the epidemiology dataset timeframe
# Converting date variable into date format
epidemiology['date'] = pd.to_datetime(epidemiology['date'])
The clustering analysis covers the two years between the WHO's declaration of a global pandemic on March 11, 2020, and March 11, 2022.
Filtering the dataset to this window and grouping the epidemiology data by location:
# Define the start and end dates
start_date = pd.to_datetime('2020-03-11')
end_date = pd.to_datetime('2022-03-11')
# Filter the DataFrame based on the date range
epidemiology = epidemiology[(epidemiology['date'] >= start_date) & (epidemiology['date'] <= end_date)]
grouped = epidemiology.groupby('location_key')[['cumulative_confirmed', 'cumulative_deceased',
'cumulative_recovered', 'cumulative_tested']].max()
grouped = grouped.reset_index()
Filtering to country-level aggregates only (two-character location keys):
filtered_grouped = grouped[grouped['location_key'].str.len() == 2]
Handling missing data
missing_recovered = filtered_grouped[filtered_grouped['cumulative_recovered'].isna()]['location_key']
country_cumulative_recovered = {}
for code in missing_recovered:
    # Sum the subregion totals for this country (subregion keys share the country-code prefix)
    cumulative_recovered = grouped.loc[grouped['location_key'].str.contains('^' + code), 'cumulative_recovered'].sum()
    country_cumulative_recovered[code] = cumulative_recovered
# Convert dictionary to DataFrame
country_cumulative_recovered = pd.DataFrame.from_dict(country_cumulative_recovered, orient='index', columns=['cumulative_recovered'])
missing_tested = filtered_grouped[filtered_grouped['cumulative_tested'].isna()]['location_key']
country_cumulative_tested = {}
for test in missing_tested:
    cumulative_tested = grouped.loc[grouped['location_key'].str.contains('^' + test), 'cumulative_tested'].sum()
    country_cumulative_tested[test] = cumulative_tested
# Convert dictionary to DataFrame
country_cumulative_tested = pd.DataFrame.from_dict(country_cumulative_tested, orient='index', columns=['cumulative_tested'])
Cleaning epidemiology dataset
pattern = '^(' + '|'.join(map(re.escape, missing_tested)) + ')'
filtered_grouped = filtered_grouped[~filtered_grouped['location_key'].str.contains(pattern)]
filtered_grouped = filtered_grouped[filtered_grouped['location_key'] != 'EH']
filtered_grouped.drop('cumulative_recovered', axis=1, inplace=True)
filtered_grouped = filtered_grouped.reset_index(drop=True)
pd.set_option('display.float_format', '{:.2f}'.format)
country = filtered_grouped
demographics = pd.read_csv("demographics.csv")
country = country.merge(demographics, on='location_key')
columns_to_drop = ['population_largest_city', 'population_clustered', 'population_male', 'population_female',
'population_age_00_09', 'population_age_10_19', 'population_age_20_29', 'population_age_30_39',
'population_age_40_49', 'population_age_50_59', 'population_age_60_69', 'population_age_70_79',
'population_age_80_and_older']
country.drop(columns_to_drop, axis=1, inplace=True)
country.dropna(subset=['population_rural', 'population_urban', 'human_development_index'], inplace=True)
economy = pd.read_csv("economy.csv")
country = country.merge(economy, on='location_key')
country.dropna(subset=['human_capital_index'], inplace=True)
geography = pd.read_csv("geography.csv")
country = country.merge(geography, on='location_key')
columns_to_drop = ['openstreetmap_id', 'elevation_m', 'area_rural_sq_km', 'area_urban_sq_km' ]
country.drop(columns_to_drop, axis=1, inplace=True)
health = pd.read_csv("health.csv")
country = country.merge(health, on='location_key')
columns_to_drop = ['hospital_beds_per_1000', 'adult_male_mortality_rate',
'infant_mortality_rate', 'adult_female_mortality_rate']
country.drop(columns_to_drop, axis=1, inplace=True)
country.dropna(subset=['smoking_prevalence', 'physicians_per_1000','health_expenditure_usd',
'out_of_pocket_health_expenditure_usd'], inplace=True)
Note that the hospitalizations dataset will not be merged into the main ('country') dataset because of its limited number of observations; it will instead be used later to reinforce the analysis of specific clusters.
hospitalizations = pd.read_csv("hospitalizations.csv")
hospitalizations = hospitalizations.groupby('location_key')[['cumulative_hospitalized_patients', 'cumulative_intensive_care_patients',
'cumulative_ventilator_patients']].max()
hospitalizations = hospitalizations.reset_index()
hosp_country = hospitalizations[hospitalizations['location_key'].str.len() == 2]
chp = hosp_country[hosp_country['cumulative_hospitalized_patients'].isna()]['location_key']
chp_hosp = {}
for cum in chp:
    chp_agg = hospitalizations.loc[hospitalizations['location_key'].str.contains('^' + cum), 'cumulative_hospitalized_patients'].sum()
    chp_hosp[cum] = chp_agg
# Convert dictionary to DataFrame
chp_hosp = pd.DataFrame.from_dict(chp_hosp, orient='index', columns=['chp_agg'])
chp_hosp.reset_index(inplace = True)
chp_hosp.rename(columns={'index': 'location_key'}, inplace=True)
Conclusion 1: remove observations with null values in the cumulative_hospitalized_patients variable.
cic = hosp_country[hosp_country['cumulative_intensive_care_patients'].isna()]['location_key']
cic_hosp = {}
for cum in cic:
    cic_agg = hospitalizations.loc[hospitalizations['location_key'].str.contains('^' + cum), 'cumulative_intensive_care_patients'].sum()
    cic_hosp[cum] = cic_agg
# Convert dictionary to DataFrame
cic_hosp = pd.DataFrame.from_dict(cic_hosp, orient='index', columns=['cic_agg'])
cic_hosp.reset_index(inplace = True)
cic_hosp.rename(columns={'index': 'location_key'}, inplace=True)
# Manually setting Switzerland's (CH) cumulative ICU patient count
hosp_country.loc[hosp_country['location_key'] == 'CH', 'cumulative_intensive_care_patients'] = 730
Conclusion 2: remove all observations with a zero value in the cumulative_intensive_care_patients column.
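The removal itself is not shown in the notebook; a minimal sketch of what that step could look like, assuming a zero indicates no usable ICU reporting:

# Hypothetical step (not shown in the original notebook): drop countries whose
# cumulative_intensive_care_patients is zero
hosp_country = hosp_country[hosp_country['cumulative_intensive_care_patients'] != 0]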
cvp = hosp_country[hosp_country['cumulative_ventilator_patients'].isna()]['location_key']
cvp_hosp = {}
for cum in cvp:
    cvp_agg = hospitalizations.loc[hospitalizations['location_key'].str.contains('^' + cum), 'cumulative_ventilator_patients'].sum()
    cvp_hosp[cum] = cvp_agg
# Convert dictionary to DataFrame
cvp_hosp = pd.DataFrame.from_dict(cvp_hosp, orient='index', columns=['cvp_agg'])
cvp_hosp.reset_index(inplace = True)
cvp_hosp.rename(columns={'index': 'location_key'}, inplace=True)
Conclusion 3: drop the cumulative_ventilator_patients column.
hosp_country.drop('cumulative_ventilator_patients', axis=1, inplace=True)
hosp_country.dropna(subset=['cumulative_hospitalized_patients', 'cumulative_intensive_care_patients'], inplace=True)
Recombining All Hospitalization Information
hospitalizations = hosp_country.merge(cvp_hosp, on='location_key', how='left')
hospitalizations = hospitalizations.merge(chp_hosp, on='location_key', how='left')
hospitalizations = hospitalizations.merge(cic_hosp, on='location_key', how='left')
Only 'stringency_index' will be used as a proxy for the government response, with more in-depth analysis to follow once the clustering has been done.
# Importing government response dataset
gov_response = pd.read_csv("oxford-government-response.csv")
# Converting date variable into date format
gov_response['date'] = pd.to_datetime(gov_response['date'])
# Define the start and end dates
start_date = pd.to_datetime('2020-03-11')
end_date = pd.to_datetime('2022-03-11')
# Filter the DataFrame based on the date range
gov_response = gov_response[(gov_response['date'] >= start_date) & (gov_response['date'] <= end_date)]
grouped = gov_response.groupby('location_key').max()
grouped = grouped.reset_index()
gov_response = grouped[grouped['location_key'].str.len() == 2]
gov_response_merge = gov_response[['location_key','stringency_index']]
country = country.merge(gov_response_merge, on='location_key', how='left')
# Importing mobility dataset
mobility = pd.read_csv("mobility.csv")
# Converting date variable into date format
mobility['date'] = pd.to_datetime(mobility['date'])
# Define the start and end dates
start_date = pd.to_datetime('2020-03-11')
end_date = pd.to_datetime('2022-03-11')
# Filter the DataFrame based on the date range
mobility = mobility[(mobility['date'] >= start_date) & (mobility['date'] <= end_date)]
mob = mobility.groupby('location_key')[['mobility_retail_and_recreation', 'mobility_grocery_and_pharmacy',
'mobility_parks', 'mobility_transit_stations','mobility_workplaces',
'mobility_residential']].mean()
mob = mob.reset_index()
filtered_mob = mob[mob['location_key'].str.len() == 2]
country = country.merge(filtered_mob, on='location_key')
# Importing weather dataset
weather = pd.read_csv("weather.csv")
# Converting date variable into date format
weather['date'] = pd.to_datetime(weather['date'])
# Define the start and end dates
start_date = pd.to_datetime('2020-03-11')
end_date = pd.to_datetime('2022-03-11')
# Filter the DataFrame based on the date range
weather = weather[(weather['date'] >= start_date) & (weather['date'] <= end_date)]
wet = weather.groupby('location_key')[['average_temperature_celsius', 'rainfall_mm',
'dew_point', 'relative_humidity']].mean()
wet = wet.reset_index()
filtered_wet = wet[wet['location_key'].str.len() == 2]
country = country.merge(filtered_wet, on='location_key')
# Converting counts to per-capita rates
columns_to_divide = ['cumulative_confirmed', 'cumulative_deceased',
                     'cumulative_tested', 'population_rural', 'population_urban']
country[columns_to_divide] = country[columns_to_divide].div(country['population'], axis=0)
# Dropping absolute-scale and redundant variables
columns_to_drop = ['population', 'gdp_usd', 'out_of_pocket_health_expenditure_usd',
                   'dew_point', 'rainfall_mm']
country.drop(columns_to_drop, axis=1, inplace=True)
del epidemiology, demographics, economy, geography, health,filtered_grouped, grouped,filtered_mob,mob,wet,filtered_wet
gc.collect();
country.set_index('location_key', inplace=True)
country_cluster = country[['cumulative_confirmed', 'cumulative_deceased', 'cumulative_tested',
'population_rural', 'population_urban', 'population_density',
'human_development_index', 'gdp_per_capita_usd', 'human_capital_index','area_sq_km', 'life_expectancy',
'smoking_prevalence', 'diabetes_prevalence', 'pollution_mortality_rate',
'comorbidity_mortality_rate', 'nurses_per_1000', 'physicians_per_1000',
'health_expenditure_usd','stringency_index', 'mobility_retail_and_recreation',
'mobility_grocery_and_pharmacy', 'mobility_parks',
'mobility_transit_stations', 'mobility_workplaces',
'mobility_residential', 'average_temperature_celsius',
'relative_humidity']]
# Scaling the country data to zero mean and unit variance
scaler = preprocessing.StandardScaler()
scaled_country = scaler.fit_transform(country_cluster)
# Restoring the original column names on the scaled frame
scaled_country = pd.DataFrame(scaled_country, columns=country_cluster.columns)
# Checking the scaled data
scaled_country.head()
  | cumulative_confirmed | cumulative_deceased | cumulative_tested | population_rural | population_urban | population_density | human_development_index | gdp_per_capita_usd | human_capital_index | area_sq_km | life_expectancy | smoking_prevalence | diabetes_prevalence | pollution_mortality_rate | comorbidity_mortality_rate | nurses_per_1000 | physicians_per_1000 | health_expenditure_usd | stringency_index | mobility_retail_and_recreation | mobility_grocery_and_pharmacy | mobility_parks | mobility_transit_stations | mobility_workplaces | mobility_residential | average_temperature_celsius | relative_humidity
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | -0.37 | -0.84 | 4.07 | -1.01 | 0.93 | -0.25 | 0.76 | 1.13 | 0.37 | -0.37 | 0.47 | 0.83 | 2.16 | -0.28 | -0.04 | 0.09 | 0.24 | -0.06 | 0.35 | 0.07 | 0.10 | -1.08 | -0.56 | 0.22 | 0.49 | 1.24 | -1.27 |
1 | 0.49 | 1.03 | -0.42 | -1.24 | 1.21 | -0.57 | 0.50 | -0.42 | 0.04 | 0.71 | 0.28 | 0.07 | -0.42 | -0.69 | -0.22 | -0.60 | 1.19 | -0.08 | 1.39 | -1.41 | -0.52 | -1.67 | -0.54 | 0.59 | 0.45 | -0.10 | -0.77 |
2 | 1.66 | 0.46 | 5.47 | 0.31 | -0.27 | -0.28 | 1.07 | 1.52 | 1.30 | -0.37 | 1.04 | 0.91 | -0.24 | -0.85 | -1.05 | 0.40 | 1.95 | 1.72 | -0.40 | -1.08 | -0.75 | 0.48 | -0.52 | -0.96 | -0.20 | -1.17 | 0.70 |
3 | 0.01 | -0.85 | 0.21 | -0.98 | 0.93 | -0.61 | 1.28 | 1.68 | 1.37 | 2.68 | 1.17 | -0.69 | -0.49 | -0.95 | -1.48 | 1.61 | 0.98 | 1.92 | -0.82 | -0.07 | -0.38 | -0.69 | -1.41 | -0.09 | 0.14 | 0.22 | -2.01 |
4 | -0.18 | 2.38 | -0.47 | 0.78 | -0.70 | -0.42 | -0.13 | -0.60 | 0.09 | -0.38 | 0.39 | 1.91 | 0.35 | 0.08 | 0.15 | 0.09 | 0.00 | -0.51 | 0.64 | 0.44 | 0.52 | 0.10 | 0.33 | 0.42 | -2.49 | -0.89 | 0.28 |
# Initializing an empty dict for the SSE of each k
sse = {}
for k in range(1, 15):
    # Initialize KMeans with k clusters
    kmeans = KMeans(n_clusters=k, random_state=296)
    # Fit KMeans on the scaled dataset
    kmeans.fit(scaled_country)
    # Store the sum of squared errors (inertia) for this k
    sse[k] = kmeans.inertia_
# Plot SSE against k
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
plt.title('The Elbow Method')
plt.xlabel('k')
plt.ylabel('SSE');
The elbow chart shows the SSE dropping sharply from k=1 to k=5, while larger values of k contribute only small additional decreases. k=3 through k=5 are therefore examined in detail to determine which number of clusters best separates the variables while remaining easy to interpret for the deeper analysis that follows.
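As a complementary check (not part of the original analysis), the mean silhouette coefficient can corroborate the candidate range; a minimal sketch using the already-imported sklearn metrics module:

# Hedged sketch: silhouette scores for the candidate k values.
# Higher is better; this complements, rather than replaces, the elbow heuristic.
for k in range(3, 6):
    labels = KMeans(n_clusters=k, random_state=296).fit_predict(scaled_country)
    score = metrics.silhouette_score(scaled_country, labels)
    print(f'k={k}: mean silhouette coefficient = {score:.3f}')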
# Number of cluster equals to 3
kmeans3 = KMeans(n_clusters=3, random_state=296)
kmeans3.fit(scaled_country)
cluster_labels3 = kmeans3.labels_
k_means3 = scaled_country.assign(Cluster = cluster_labels3)
k_means3.groupby(['Cluster']).agg({
'cumulative_confirmed': ['mean'],
'cumulative_deceased':['mean'],
'cumulative_tested': ['mean'],
'population_rural': ['mean'],
'population_urban': ['mean'],
'population_density': ['mean'],
'human_development_index': ['mean'],
'gdp_per_capita_usd': ['mean'],
'human_capital_index': ['mean'],
'area_sq_km': ['mean'],
'life_expectancy': ['mean'],
'smoking_prevalence': ['mean'],
'diabetes_prevalence': ['mean'],
'pollution_mortality_rate': ['mean'],
'comorbidity_mortality_rate': ['mean'],
'nurses_per_1000': ['mean'],
'physicians_per_1000': ['mean'],
'health_expenditure_usd': ['mean'],
'stringency_index': ['mean'],
'mobility_retail_and_recreation': ['mean'],
'mobility_grocery_and_pharmacy': ['mean'],
'mobility_parks': ['mean'],
'mobility_transit_stations': ['mean'],
'mobility_workplaces': ['mean'],
'mobility_residential': ['mean'],
'average_temperature_celsius': ['mean'],
'relative_humidity': ['mean','count'],
}).round(2)
All values below are cluster means of the standardized variables; count is the number of countries in each cluster.

Cluster | cumulative_confirmed | cumulative_deceased | cumulative_tested | population_rural | population_urban | population_density | human_development_index | gdp_per_capita_usd | human_capital_index | area_sq_km | life_expectancy | smoking_prevalence | diabetes_prevalence | pollution_mortality_rate | comorbidity_mortality_rate | nurses_per_1000 | physicians_per_1000 | health_expenditure_usd | stringency_index | mobility_retail_and_recreation | mobility_grocery_and_pharmacy | mobility_parks | mobility_transit_stations | mobility_workplaces | mobility_residential | average_temperature_celsius | relative_humidity | count
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.80 | 0.56 | 0.52 | -0.50 | 0.52 | -0.08 | 0.84 | 0.77 | 0.99 | 0.15 | 0.78 | 0.71 | -0.26 | -0.71 | -0.59 | 0.96 | 0.90 | 0.81 | -0.31 | -0.25 | -0.29 | 0.72 | -0.36 | -0.50 | -0.27 | -0.98 | 0.45 | 37 |
1 | -0.87 | -0.87 | -0.55 | 1.09 | -1.13 | -0.03 | -1.42 | -0.80 | -1.28 | -0.18 | -1.42 | -0.77 | -0.52 | 1.37 | 0.82 | -0.85 | -1.13 | -0.70 | -0.29 | 1.32 | 1.06 | 0.19 | 1.33 | 1.26 | -0.33 | 0.64 | -0.47 | 22 |
2 | -0.32 | -0.05 | -0.21 | -0.16 | 0.18 | 0.11 | 0.00 | -0.33 | -0.26 | -0.05 | 0.07 | -0.28 | 0.64 | -0.12 | 0.11 | -0.51 | -0.26 | -0.44 | 0.54 | -0.61 | -0.39 | -0.94 | -0.49 | -0.28 | 0.52 | 0.67 | -0.19 | 33 |
Quick results for k = 3: 21 nicely split variables, 6 ambiguous variables.
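The tally criterion is not shown in the notebook; one plausible way to produce such counts, sketched with an assumed separation threshold of 1.0 standard deviations, is to call a variable nicely split when its cluster means are far apart:

# Hypothetical tally helper (the actual criterion used above is not shown)
def tally_split(assigned, threshold=1.0):
    means = assigned.groupby('Cluster').mean()   # cluster-by-variable means
    spread = means.max() - means.min()           # per-variable range of cluster means
    nice = int((spread >= threshold).sum())      # variables with well-separated clusters
    return nice, len(spread) - nice

nice, ambiguous = tally_split(k_means3)          # counts depend on the assumed threshold
print(f'nicely split: {nice}, ambiguous: {ambiguous}')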
# Number of cluster equals to 4
kmeans4 = KMeans(n_clusters=4, random_state=296)
kmeans4.fit(scaled_country)
cluster_labels4 = kmeans4.labels_
k_means4 = scaled_country.assign(Cluster = cluster_labels4)
k_means4.groupby(['Cluster']).agg({
'cumulative_confirmed': ['mean'],
'cumulative_deceased':['mean'],
'cumulative_tested': ['mean'],
'population_rural': ['mean'],
'population_urban': ['mean'],
'population_density': ['mean'],
'human_development_index': ['mean'],
'gdp_per_capita_usd': ['mean'],
'human_capital_index': ['mean'],
'area_sq_km': ['mean'],
'life_expectancy': ['mean'],
'smoking_prevalence': ['mean'],
'diabetes_prevalence': ['mean'],
'pollution_mortality_rate': ['mean'],
'comorbidity_mortality_rate': ['mean'],
'nurses_per_1000': ['mean'],
'physicians_per_1000': ['mean'],
'health_expenditure_usd': ['mean'],
'mobility_retail_and_recreation': ['mean'],
'mobility_grocery_and_pharmacy': ['mean'],
'mobility_parks': ['mean'],
'mobility_transit_stations': ['mean'],
'mobility_workplaces': ['mean'],
'mobility_residential': ['mean'],
'average_temperature_celsius': ['mean'],
'relative_humidity': ['mean','count'],
}).round(2)
All values below are cluster means of the standardized variables; count is the number of countries in each cluster.

Cluster | cumulative_confirmed | cumulative_deceased | cumulative_tested | population_rural | population_urban | population_density | human_development_index | gdp_per_capita_usd | human_capital_index | area_sq_km | life_expectancy | smoking_prevalence | diabetes_prevalence | pollution_mortality_rate | comorbidity_mortality_rate | nurses_per_1000 | physicians_per_1000 | health_expenditure_usd | mobility_retail_and_recreation | mobility_grocery_and_pharmacy | mobility_parks | mobility_transit_stations | mobility_workplaces | mobility_residential | average_temperature_celsius | relative_humidity | count
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.72 | -0.02 | 0.70 | -0.86 | 0.86 | 0.16 | 1.08 | 1.46 | 1.20 | 0.20 | 1.07 | 0.19 | -0.22 | -0.89 | -1.24 | 1.30 | 0.88 | 1.60 | -0.27 | -0.43 | 0.71 | -0.65 | -0.64 | 0.06 | -0.86 | 0.41 | 21 |
1 | -0.87 | -0.87 | -0.55 | 1.09 | -1.13 | -0.03 | -1.42 | -0.80 | -1.28 | -0.18 | -1.42 | -0.77 | -0.52 | 1.37 | 0.82 | -0.85 | -1.13 | -0.70 | 1.32 | 1.06 | 0.19 | 1.33 | 1.26 | -0.33 | 0.64 | -0.47 | 22 |
2 | 0.94 | 1.37 | 0.34 | 0.03 | -0.02 | -0.38 | 0.53 | -0.13 | 0.74 | 0.09 | 0.37 | 1.36 | -0.35 | -0.46 | 0.34 | 0.43 | 0.99 | -0.24 | -0.14 | -0.03 | 0.89 | 0.07 | -0.32 | -0.88 | -1.18 | 0.55 | 15 |
3 | -0.30 | -0.03 | -0.23 | -0.19 | 0.21 | 0.09 | 0.02 | -0.32 | -0.24 | -0.05 | 0.09 | -0.22 | 0.63 | -0.13 | 0.08 | -0.44 | -0.25 | -0.42 | -0.63 | -0.41 | -0.95 | -0.49 | -0.28 | 0.56 | 0.64 | -0.19 | 34 |
Quick results for k = 4: 15 nicely split variables, 12 ambiguous variables.
# Number of cluster equals to 5
kmeans5 = KMeans(n_clusters=5, random_state=296)
kmeans5.fit(scaled_country)
cluster_labels5 = kmeans5.labels_
k_means5 = scaled_country.assign(Cluster = cluster_labels5)
k_means5.groupby(['Cluster']).agg({
'cumulative_confirmed': ['mean'],
'cumulative_deceased':['mean'],
'cumulative_tested': ['mean'],
'population_rural': ['mean'],
'population_urban': ['mean'],
'population_density': ['mean'],
'human_development_index': ['mean'],
'gdp_per_capita_usd': ['mean'],
'human_capital_index': ['mean'],
'area_sq_km': ['mean'],
'life_expectancy': ['mean'],
'smoking_prevalence': ['mean'],
'diabetes_prevalence': ['mean'],
'pollution_mortality_rate': ['mean'],
'comorbidity_mortality_rate': ['mean'],
'nurses_per_1000': ['mean'],
'physicians_per_1000': ['mean'],
'health_expenditure_usd': ['mean'],
'mobility_retail_and_recreation': ['mean'],
'mobility_grocery_and_pharmacy': ['mean'],
'mobility_parks': ['mean'],
'mobility_transit_stations': ['mean'],
'mobility_workplaces': ['mean'],
'mobility_residential': ['mean'],
'average_temperature_celsius': ['mean'],
'relative_humidity': ['mean','count'],
}).round(2)
All values below are cluster means of the standardized variables; count is the number of countries in each cluster.

Cluster | cumulative_confirmed | cumulative_deceased | cumulative_tested | population_rural | population_urban | population_density | human_development_index | gdp_per_capita_usd | human_capital_index | area_sq_km | life_expectancy | smoking_prevalence | diabetes_prevalence | pollution_mortality_rate | comorbidity_mortality_rate | nurses_per_1000 | physicians_per_1000 | health_expenditure_usd | mobility_retail_and_recreation | mobility_grocery_and_pharmacy | mobility_parks | mobility_transit_stations | mobility_workplaces | mobility_residential | average_temperature_celsius | relative_humidity | count
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 0.98 | 1.31 | 0.35 | 0.03 | -0.02 | -0.37 | 0.53 | -0.11 | 0.76 | 0.06 | 0.41 | 1.28 | -0.29 | -0.49 | 0.25 | 0.42 | 1.04 | -0.21 | -0.16 | -0.03 | 0.83 | 0.01 | -0.35 | -0.78 | -1.12 | 0.54 | 16 |
1 | -0.85 | -0.68 | -0.52 | 1.28 | -1.24 | 0.21 | -0.87 | -0.77 | -0.84 | -0.12 | -0.79 | -0.14 | -0.12 | 0.99 | 1.14 | -0.77 | -1.02 | -0.68 | -0.45 | -0.56 | -0.64 | -0.46 | -0.38 | 1.00 | 0.74 | 0.41 | 12 |
2 | 0.67 | -0.05 | 0.70 | -0.90 | 0.90 | 0.18 | 1.11 | 1.52 | 1.20 | 0.22 | 1.08 | 0.19 | -0.26 | -0.89 | -1.24 | 1.35 | 0.82 | 1.67 | -0.26 | -0.45 | 0.74 | -0.64 | -0.64 | 0.03 | -0.89 | 0.41 | 20 |
3 | -0.85 | -0.87 | -0.54 | 0.98 | -1.03 | -0.14 | -1.40 | -0.79 | -1.29 | -0.14 | -1.48 | -0.78 | -0.52 | 1.39 | 0.79 | -0.86 | -1.11 | -0.70 | 1.55 | 1.40 | 0.28 | 1.58 | 1.51 | -0.57 | 0.65 | -0.71 | 18 |
4 | -0.14 | 0.14 | -0.14 | -0.59 | 0.60 | 0.09 | 0.19 | -0.20 | -0.12 | -0.06 | 0.30 | -0.33 | 0.80 | -0.43 | -0.27 | -0.35 | -0.04 | -0.35 | -0.57 | -0.35 | -0.98 | -0.40 | -0.16 | 0.39 | 0.58 | -0.35 | 26 |
Quick results for k = 5: 4 nicely split variables, 23 ambiguous variables.
Based on the cluster analysis above, k = 3 yields the clusters with the most distinct characteristics.
# Assigning back the cluster label to country_cluster dataset columns
cluster = country_cluster.assign(cluster = cluster_labels3)
cluster['cluster'] = cluster['cluster'].astype('category')
# Renaming the clusters with descriptive labels matched to the k = 3 cluster means:
# cluster 0 is urban, high-HDI, and well resourced in health care; cluster 1 is rural,
# low-HDI, and physician-scarce; cluster 2 shows high diabetes prevalence, stricter
# government responses, and elevated residential mobility
cluster['cluster'] = cluster['cluster'].cat.rename_categories({
    0: 'Resilient Urban Health Capitals',
    1: 'Vulnerable Rural Health Deficit',
    2: 'Urban Diabetes Burden & Residential Refuges',
})
cluster.reset_index(inplace=True)
identifier = pd.DataFrame()
identifier[['latitude','longitude']] = country[['latitude','longitude']]
identifier.reset_index(inplace=True)
identifier = identifier.merge(index, on='location_key')
cluster = cluster.merge(identifier, on='location_key')
cluster = cluster.sort_values(by=['cluster', 'country_name'], ascending=[True, True])
desired_columns_order = ['location_key', 'country_name', 'cluster', 'latitude', 'longitude',
'cumulative_confirmed', 'cumulative_deceased', 'cumulative_tested',
'population_rural', 'population_urban', 'population_density',
'human_development_index', 'gdp_per_capita_usd',
'human_capital_index', 'area_sq_km', 'life_expectancy',
'smoking_prevalence', 'diabetes_prevalence',
'pollution_mortality_rate', 'comorbidity_mortality_rate',
'nurses_per_1000', 'physicians_per_1000',
'health_expenditure_usd', 'stringency_index',
'mobility_retail_and_recreation', 'mobility_grocery_and_pharmacy',
'mobility_parks', 'mobility_transit_stations', 'mobility_workplaces',
'mobility_residential', 'average_temperature_celsius',
'relative_humidity']
cluster = cluster.reindex(columns=desired_columns_order)
cluster.reset_index(drop=True, inplace=True)
enable_plotly_in_cell()
available_columns = [col for col in cluster.columns if col not in ['latitude', 'longitude', 'location_key', 'country_name', 'cluster']]
# Create the scatter plot
def create_scatter_plot(x_axis_col, y_axis_col):
    fig = px.scatter(
        cluster,
        x=x_axis_col,
        y=y_axis_col,
        color='cluster',
        hover_data=['country_name', x_axis_col, y_axis_col],  # Data shown in hover tooltip
        labels={
            'cluster': 'Cluster',
            'country_name': 'Country',
            x_axis_col: x_axis_col,
            y_axis_col: y_axis_col
        },
        title=f"Scatter plot of {y_axis_col} vs {x_axis_col}",
    )
    # Update axis labels
    fig.update_xaxes(title_text=x_axis_col)
    fig.update_yaxes(title_text=y_axis_col)
    # Set the legend position to bottom center
    fig.update_layout(legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5))
    return fig
# Create interactive dropdowns
x_dropdown = widgets.Dropdown(options=available_columns, value=available_columns[0], description='X-axis:')
y_dropdown = widgets.Dropdown(options=available_columns, value=available_columns[1], description='Y-axis:')
# Display the dropdowns
display(x_dropdown, y_dropdown)
# Create an Output widget for the plot
output = widgets.Output()
display(output)
# Initial plot
initial_plot = create_scatter_plot(x_dropdown.value, y_dropdown.value)
with output:
    clear_output(wait=True)
    display(initial_plot)
# Define the update function
def update_plot(change):
    scatter_plot = create_scatter_plot(x_dropdown.value, y_dropdown.value)
    with output:
        clear_output(wait=True)
        display(scatter_plot)
# Observe changes in dropdown values
x_dropdown.observe(update_plot, names='value')
y_dropdown.observe(update_plot, names='value')
enable_plotly_in_cell()
# Scatter map
map_fig = px.scatter_geo(
    cluster,
    lat='latitude',
    lon='longitude',
    color='cluster',
    hover_data=['country_name', 'cluster'],
    labels={'cluster': 'Cluster'},
    title='Geographical Map with Cluster Count',
    custom_data=['cluster']  # Include the 'cluster' column in the custom data for clickData
)
# Set the legend position to bottom center
map_fig.update_layout(legend=dict(orientation='h', yanchor='bottom', y=1.02, xanchor='center', x=0.5))
# Create a Scattergeo trace for highlighting the selected cluster
selected_cluster_trace = go.Scattergeo(
    lat=[],
    lon=[],
    mode='markers',
    marker=dict(
        size=14,
        color='red',
        opacity=0.6
    ),
    customdata=[],
    hoverinfo='skip'  # Disable hover info for the highlighted cluster
)
# Add the selected cluster trace to the figure
map_fig.add_trace(selected_cluster_trace)
# Handle click event
def update_scatter_map(trace, points, selector):
    if points.point_inds:
        selected_cluster = map_fig.data[0].customdata[points.point_inds[0]]
        # Update the selected cluster in the highlighted trace
        selected_cluster_trace.lat = [cluster.loc[cluster['cluster'] == selected_cluster, 'latitude'].iloc[0]]
        selected_cluster_trace.lon = [cluster.loc[cluster['cluster'] == selected_cluster, 'longitude'].iloc[0]]
        selected_cluster_trace.customdata = [selected_cluster]
        # Update the layout to show the highlighted cluster
        map_fig.update_traces(selectedpoints=[0])
# Attach the click event handler
map_fig.for_each_trace(lambda trace: trace.on_click(update_scatter_map))
# Show the plot
map_fig.show()
vrhd_cluster = cluster[cluster['cluster'] == 'Vulnerable Rural Health Deficit'][['country_name']]
vrhd_cluster.columns = ['Vulnerable Rural Health Deficit Cluster Country Members']
country_names = vrhd_cluster['Vulnerable Rural Health Deficit Cluster Country Members']
# Preprocess the text to replace spaces in multi-word country names with underscores
country_names = country_names.str.replace(' ', '_')
# WordCloud customization
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    contour_color='steelblue',
    contour_width=2,
    max_font_size=35,
    min_word_length=3,
    prefer_horizontal=1.0,
    random_state=42
).generate(' '.join(country_names))
# Make the white background of the word cloud image transparent
wordcloud_image = wordcloud.to_image().convert('RGBA')
wordcloud_image_data = wordcloud_image.getdata()
new_image_data = [(r, g, b, a) if (r, g, b) != (255, 255, 255) else (255, 255, 255, 0) for r, g, b, a in wordcloud_image_data]
wordcloud_image.putdata(new_image_data)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_image, interpolation='bilinear')
plt.axis('off')
plt.title('Vulnerable Rural Health Deficit Cluster Country Members')
plt.show()
udbrr_cluster = cluster[cluster['cluster'] == 'Urban Diabetes Burden & Residential Refuges'][['country_name']]
udbrr_cluster.columns = ['Urban Diabetes Burden and Residential Refuges']
country_names = udbrr_cluster['Urban Diabetes Burden and Residential Refuges']
# Preprocess the text to replace spaces in multi-word country names with underscores
country_names = country_names.str.replace(' ', '_')
# WordCloud customization
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    contour_color='steelblue',
    contour_width=1,
    max_font_size=25,
    min_word_length=3,
    prefer_horizontal=1.0,
    random_state=42
).generate(' '.join(country_names))
# Make the white background of the word cloud image transparent
wordcloud_image = wordcloud.to_image().convert('RGBA')
wordcloud_image_data = wordcloud_image.getdata()
new_image_data = [(r, g, b, a) if (r, g, b) != (255, 255, 255) else (255, 255, 255, 0) for r, g, b, a in wordcloud_image_data]
wordcloud_image.putdata(new_image_data)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_image, interpolation='bilinear')
plt.axis('off')
plt.title('Urban Diabetes Burden and Residential Refuges')
plt.show()
ruhc_cluster = cluster[cluster['cluster'] == 'Resilient Urban Health Capitals'][['country_name']]
ruhc_cluster.columns = ['Resilient Urban Health Capitals']
country_names = ruhc_cluster['Resilient Urban Health Capitals']
# Preprocess the text to replace spaces in multi-word country names with underscores
country_names = country_names.str.replace(' ', '_')
# WordCloud customization
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    colormap='viridis',
    contour_color='steelblue',
    contour_width=1,
    max_font_size=25,
    min_word_length=3,
    prefer_horizontal=1.0,
    random_state=42
).generate(' '.join(country_names))
# Make the white background of the word cloud image transparent
wordcloud_image = wordcloud.to_image().convert('RGBA')
wordcloud_image_data = wordcloud_image.getdata()
new_image_data = [(r, g, b, a) if (r, g, b) != (255, 255, 255) else (255, 255, 255, 0) for r, g, b, a in wordcloud_image_data]
wordcloud_image.putdata(new_image_data)
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud_image, interpolation='bilinear')
plt.axis('off')
plt.title('Resilient Urban Health Capitals')
plt.show()