from ddf_utils.factory.ihme import IHMELoader
import pathlib
import zipfile
import shutil

# make output directory
outdir = pathlib.Path('data')
outdir.mkdir(exist_ok=True)

# get loader and metadata
GBD = IHMELoader()
metadata = GBD.load_metadata()

# choose version: latest
version = metadata['version']['id'].max()

# choose locations: all but 'custom'
locations_md = metadata['location']
locations = locations_md[locations_md['id'] != 'custom']['id'].tolist()

# download cause data
zippath = GBD.bulk_download(
    version = version, # which version of the data to use
    out_dir = outdir, # where to save the data
    context = 'cause', # GBD nomenclature for 'diseases'
    age = [2,3,4,5], # early neonatal, late neonatal, postneonatal, 1-4 years
    location = locations, # all locations
    sex = 3, # both male and female
    year = 2011, # year
    metric = 1, # number of cases
    measure = [2,3,4,5], # daly, yld, yll, incidence
    cause = [302, 322], # diarrhea, lower respiratory infections
    idsOrNames = 'both', # include plain text descriptions of the codes in the results file
)

working on https://s3.healthdata.org/gbd-api-2019-public/be12c64a804d524dcf5a4101782dfe22
check status as http://ghdx.healthdata.org/gbd-results-tool/result/be12c64a804d524dcf5a4101782dfe22
available downloads:
https://s3.healthdata.org:443/gbd-api-2019-public/be12c64a804d524dcf5a4101782dfe22_files/IHME-GBD_2019_DATA-be12c64a-1.zip
downloading https://s3.healthdata.org:443/gbd-api-2019-public/be12c64a804d524dcf5a4101782dfe22_files/IHME-GBD_2019_DATA-be12c64a-1.zip to data\be12c64a\IHME-GBD_2019_DATA-be12c64a-1.zip
begin downloading data\be12c64a\IHME-GBD_2019_DATA-be12c64a-1.zip...

816kB [00:01, 530kB/s]


# unzip the downloaded file
fullzippath = outdir / zippath[0]
with zipfile.ZipFile(next(fullzippath.glob('*.zip')), 'r') as f:
    files = f.namelist()
    f.extractall(outdir)
    
# delete zip archive and parent folder
shutil.rmtree(fullzippath)

# fetch extracted file name
cause_file = outdir / [f for f in files if '.csv' in f][0]
cause_file

WindowsPath('data/IHME-GBD_2019_DATA-be12c64a-1.csv')


from ddf_utils.factory.ihme import IHMELoader
from ddf_utils.chef.helpers import read_opt

def _corrected_make_query(self, context, version, **kwargs):
        """generate a query with the context, version and all keyword arguments.

        if a parameter is mandatory but not provided, it will fill
        with default values.

        """
        # metadata
        if not self.metadata:
            self.load_metadata()

        metadata = self.metadata
        ages = metadata['age']['id'].values
        # location: there is a `custom` location. don't include that one.
        locations_md = metadata['location']
        locations = locations_md[locations_md['id'] != 'custom']['id'].tolist()
        sexs = metadata['sex']['id'].tolist()
        years = metadata['year']['id'].tolist()
        metrics = metadata['metric']['id'].tolist()
        measures = metadata['measure']['id'].tolist()
        causes = metadata['cause']['id'].tolist()
        # risk/etiology/impairment
        # There are actually 4 types data in this dictionary:
        # risk, etiology, impairment and injury n-codes.
        # however injury n-codes is not enabled.
        # we might need to take of it later.
        rei = metadata['rei']
        # risks = rei[rei['type'] == 'risk']['rei_id'].values
        # etiologys = rei[rei['type'] == 'etiology']['rei_id'].values
        # impairments = rei[rei['type'] == 'impairment']['rei_id'].values

        # others metadata filed not include:
        # - groups
        # - year_range

        queries = {}

        # create query base on context and user input
        if context == 'le':
            measure = read_opt(kwargs, 'measure', default=26)
            metric = read_opt(kwargs, 'metric', default=5)
            cause = read_opt(kwargs, 'cause', default=causes)
            queries.update({
                'measure[]': measure,
                'metric[]': metric,
                'cause[]': cause
            })
        elif context == 'cause':
            measure = read_opt(kwargs, 'measure', default=measures)
            metric = read_opt(kwargs, 'metric', default=[1, 2, 3])
            cause = read_opt(kwargs, 'cause', default=causes)
            queries.update({
                'measure[]': measure,
                'metric[]': metric,
                'cause[]': cause
            })
        elif context in ['risk', 'etiology', 'impairment']:
            # TODO: should be split, don't combine these context here.
            measure = read_opt(kwargs, 'measure', default=measures)
            metric = read_opt(kwargs, 'metric', default=[1, 2, 3])
            context_values = rei[rei['type'] == context]['id'].tolist() ################
            context_values = read_opt(kwargs, 'rei_id', default=[1, 2, 3]) ##################
            cause = read_opt(kwargs, 'cause', default=causes)
            queries.update({
                'measure[]': measure,
                'metric[]': metric,
                context+'[]':context_values,
                'rei[]': context_values,
                'cause[]':  cause
            })
        else:
            # SEV/HALE/haqi
            print('not supported context.')
            raise NotImplementedError

        # insert context and version and other configs
        rows = read_opt(kwargs, 'rows', default=10000000)  # the maximum records we can get
        # ^ Note: user guide[1] says it's 500000 row. But actually we can set this to 10000000
        # [1]: http://www.healthdata.org/sites/default/files/files/Data_viz/GBD_2017_Tools_Overview.pdf
        email = read_opt(kwargs, 'email', default='downloader@gapminder.org')
        idsOrNames = read_opt(kwargs, 'idsOrNames', default='ids')           # ids / names / both
        singleOrMult = read_opt(kwargs, 'singleOrMult', default='multiple')  # single / multiple
        base = read_opt(kwargs, 'base', default='single')

        location = read_opt(kwargs, 'location', default=locations)
        age = read_opt(kwargs, 'age', default=ages)
        sex = read_opt(kwargs, 'sex', default=sexs)
        year = read_opt(kwargs, 'year', default=years)

        queries.setdefault('context', context)
        queries.setdefault('version', version)
        queries.setdefault('rows', rows)
        queries.setdefault('email', email)
        queries.setdefault('idsOrNames', idsOrNames)
        queries.setdefault('singleOrMult', singleOrMult)
        queries.setdefault('base', base)
        queries.setdefault('location[]', location)
        queries.setdefault('age[]', age)
        queries.setdefault('sex[]', sex)
        queries.setdefault('year[]', year)

        return queries
    
IHMELoader._make_query = _corrected_make_query


# download risk-specific data
zippath = GBD.bulk_download(
    version = version, # which version of the data to use
    out_dir = outdir, # where to save the data
    context = 'risk', # GBD nomenclature for 'diseases'
    age = [2,3,4,5], # early neonatal, late neonatal, postneonatal, 1-4 years
    location = locations, # all locations
    sex = 3, # both male and female
    year = 2011, # year
    metric = 1, # number of cases
    measure = [2,3,4], # daly, yld, yll (no prevalence available)
    cause = [302, 322], # diarrhea, lower respiratory infections
    rei_id = [136, 137], # non-exclusive breastfeeding, discontinued breastfeeding
    idsOrNames = 'both', # include plain text descriptions of the codes in the results file
)

working on https://s3.healthdata.org/gbd-api-2019-public/659c2316c834cfb8244c8cefdc4628bb
check status as http://ghdx.healthdata.org/gbd-results-tool/result/659c2316c834cfb8244c8cefdc4628bb
available downloads:
https://s3.healthdata.org:443/gbd-api-2019-public/659c2316c834cfb8244c8cefdc4628bb_files/IHME-GBD_2019_DATA-659c2316-1.zip
downloading https://s3.healthdata.org:443/gbd-api-2019-public/659c2316c834cfb8244c8cefdc4628bb_files/IHME-GBD_2019_DATA-659c2316-1.zip to data\659c2316\IHME-GBD_2019_DATA-659c2316-1.zip
begin downloading data\659c2316\IHME-GBD_2019_DATA-659c2316-1.zip...

478kB [00:01, 464kB/s]


# unzip the downloaded file
fullzippath = outdir / zippath[0]
with zipfile.ZipFile(next(fullzippath.glob('*.zip')), 'r') as f:
    files = f.namelist()
    f.extractall(outdir)
    
# delete zip archive and parent folder
shutil.rmtree(fullzippath)

# fetch extracted file name
risk_file = outdir / [f for f in files if '.csv' in f][0]
risk_file

WindowsPath('data/IHME-GBD_2019_DATA-659c2316-1.csv')


# load cause data
import pandas as pd

df_cause = pd.read_csv(cause_file)
df_cause


drop_cols = ['measure_id', 'sex_id', 'age_id', 'cause_id', 'metric_id', 'metric_name']
rename_cols = {
    'measure_name': 'measure',
    'location_name': 'location',
    'sex_name':'sex',
    'age_name': 'ages',
    'cause_name': 'cause',
    'rei_name': 'risk'
}
rename_measures={
    'YLLs (Years of Life Lost)':'YLL',
    'YLDs (Years Lived with Disability)': 'YLD',
    'DALYs (Disability-Adjusted Life Years)': 'DALY',
}
df_cause = df_cause.drop(columns=drop_cols).rename(columns=rename_cols)
df_cause['measure'].replace(rename_measures, inplace=True)

df_cause


df_cause = df_cause.melt(
        id_vars=['measure', 'location_id', 'location', 'sex', 'ages', 'cause', 'year'],
        var_name='estimate',
    )

df_cause


df_prevalence = df_cause.query("measure=='Prevalence'")
df_prevalence


df_cause = df_cause.query('measure!="Prevalence"')
df_cause


# load raw data
df_risk = pd.read_csv(risk_file)
df_risk


# rename columns
df_risk['measure_name'].replace(rename_measures, inplace=True)

# melt estimates
df_risk = df_risk.\
    drop(columns=drop_cols+['rei_id']).\
    rename(columns=rename_cols).\
    melt(
        id_vars=['measure', 'location_id', 'location', 'sex', 'ages', 'cause', 'year', 'risk'],
        var_name='estimate'
    )

df_risk


df_risk_cause = df_risk.merge(
    df_cause, 
    how='outer', 
    on=['location_id','location','year','sex','ages','cause','measure'], 
    suffixes=('_risk','_cause')
)

df_risk_cause


df_risk_cause = df_risk_cause.\
    rename(columns={'measure':'measure_risk_cause'}).\
    merge(
        df_prevalence.drop(columns='measure'),
        how='outer', 
        on=['location_id','location','year','sex','ages','cause'],
    ).\
    rename(
        columns={'estimate':'estimate_prevalence', 'value':'value_prevalence'}
    )

df_risk_cause


df_risk_cause['prevalence_risk'] = df_risk_cause['value_prevalence'] * df_risk_cause['value_risk'] / df_risk_cause['value_cause']

df_risk_cause


query = "location=='Global' & cause=='Diarrheal diseases' & ages=='Post Neonatal'" +\
        "& estimate_cause=='val' & estimate_risk=='val' & estimate_prevalence=='val'" +\
        "& risk=='Discontinued breastfeeding'"
df_risk_cause.query(query)[['measure_risk_cause','prevalence_risk']].values

array([['YLL', 73097.06555306481],
       ['YLD', 125700.98773238061],
       ['DALY', 73856.51388181357]], dtype=object)


import matplotlib.pyplot as plt
import numpy as np

mean_var = {}
for label, col in zip(['risk', 'cause'],['prevalence_risk', 'value_prevalence']):
    normalized = np.log10(df_risk_cause[col])
    mean_var[label] = {'mean':normalized.mean(), 'var':normalized.var()}
    normalized.hist(alpha=0.5, bins=50, label=label)
    
plt.legend()
mean_var

{'risk': {'mean': 1.5991704670066902, 'var': 2.0670509112758144},
 'cause': {'mean': 2.3306671330733675, 'var': 2.3390394466165016}}


df_pivot = df_risk_cause.pivot(
    index=['location_id','location','year','sex','ages','cause','risk','estimate_cause','estimate_risk','estimate_prevalence'],
    columns='measure_risk_cause',
    values='prevalence_risk',
)

mean_var = {}
for measure in ['YLD', 'YLL', 'DALY']:
    normalized = np.log10(df_pivot[measure])
    mean_var[measure] = {'mean':normalized.mean(), 'var':normalized.var()}
    normalized.hist(alpha=0.5, bins=50, label=measure)
    
plt.legend()
mean_var

{'YLD': {'mean': 1.6002616178508797, 'var': 2.0723871034005734},
 'YLL': {'mean': 1.5989314839306552, 'var': 2.066927884114937},
 'DALY': {'mean': 1.598318299238536, 'var': 2.061873578870419}}


rel_diff = {}
descriptions = {}
for pair in [('YLD','YLL'), ('YLD','DALY'), ('YLL','DALY')]:
    label = f"({pair[0]}-{pair[1]})/{pair[0]}"
    rel_diff[label] = (df_pivot[pair[0]]-df_pivot[pair[1]])/df_pivot[pair[1]]
    rel_diff[label].hist(alpha=0.5, bins=50, label=label, figsize=(10,5))
    descriptions[label] = rel_diff[label].describe()

plt.legend()
descriptions

{'(YLD-YLL)/YLD': count    109350.000000
 mean          0.026372
 std           0.255749
 min          -0.881404
 25%          -0.066886
 50%           0.000606
 75%           0.075576
 max           7.424321
 dtype: float64,
 '(YLD-DALY)/YLD': count    109350.000000
 mean          0.024057
 std           0.229502
 min          -0.877975
 25%          -0.064596
 50%           0.000501
 75%           0.077823
 max           7.061630
 dtype: float64,
 '(YLL-DALY)/YLL': count    109350.000000
 mean          0.004535
 std           0.082954
 min          -0.679156
 25%          -0.007031
 50%          -0.000007
 75%           0.008113
 max           2.128778
 dtype: float64}


df_risk_cause_YLL = df_risk_cause.query('measure_risk_cause=="YLL"').copy()
df_risk_cause_YLL['exposure'] = df_risk_cause_YLL.\
    groupby(['location_id','location','year','sex','ages','risk','estimate_cause','estimate_risk','estimate_prevalence'])['prevalence_risk'].\
    transform(sum)

df_risk_cause_YLL


grouped = df_risk_cause_YLL.groupby(['location_id','location','year','sex','ages','risk'])['exposure']
df_exposure = pd.DataFrame([
    grouped.min().rename('exposure min'), 
    grouped.mean().rename('exposure mean'), 
    grouped.max().rename('exposure max')
]).T

df_exposure

	measure_id	measure_name	location_id	location_name	sex_id	sex_name	age_id	age_name	cause_id	cause_name	metric_id	metric_name	year	val	upper	lower
0	4	YLLs (Years of Life Lost)	67	Japan	3	Both	2	Early Neonatal	302	Diarrheal diseases	1	Number	2011	69.033674	76.183135	58.028415
1	4	YLLs (Years of Life Lost)	67	Japan	3	Both	3	Late Neonatal	302	Diarrheal diseases	1	Number	2011	180.218561	222.943105	136.541365
2	4	YLLs (Years of Life Lost)	67	Japan	3	Both	4	Post Neonatal	302	Diarrheal diseases	1	Number	2011	1816.582776	2020.018154	1601.250103
3	4	YLLs (Years of Life Lost)	67	Japan	3	Both	5	1 to 4	302	Diarrheal diseases	1	Number	2011	1985.316770	2230.198753	1747.728976
4	4	YLLs (Years of Life Lost)	67	Japan	3	Both	2	Early Neonatal	322	Lower respiratory infections	1	Number	2011	531.197510	587.936649	448.027001
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
21595	5	Prevalence	35440	Ishikawa	3	Both	5	1 to 4	302	Diarrheal diseases	1	Number	2011	20.338392	28.227613	14.082740
21596	5	Prevalence	35440	Ishikawa	3	Both	2	Early Neonatal	322	Lower respiratory infections	1	Number	2011	1.201336	1.496299	0.949697
21597	5	Prevalence	35440	Ishikawa	3	Both	3	Late Neonatal	322	Lower respiratory infections	1	Number	2011	1.793935	2.122410	1.513424
21598	5	Prevalence	35440	Ishikawa	3	Both	4	Post Neonatal	322	Lower respiratory infections	1	Number	2011	15.925677	18.657651	13.490323
21599	5	Prevalence	35440	Ishikawa	3	Both	5	1 to 4	322	Lower respiratory infections	1	Number	2011	48.524187	62.802658	36.607084

	measure	location_id	location	sex	ages	cause	year	val	upper	lower
0	YLL	67	Japan	Both	Early Neonatal	Diarrheal diseases	2011	69.033674	76.183135	58.028415
1	YLL	67	Japan	Both	Late Neonatal	Diarrheal diseases	2011	180.218561	222.943105	136.541365
2	YLL	67	Japan	Both	Post Neonatal	Diarrheal diseases	2011	1816.582776	2020.018154	1601.250103
3	YLL	67	Japan	Both	1 to 4	Diarrheal diseases	2011	1985.316770	2230.198753	1747.728976
4	YLL	67	Japan	Both	Early Neonatal	Lower respiratory infections	2011	531.197510	587.936649	448.027001
...	...	...	...	...	...	...	...	...	...	...
21595	Prevalence	35440	Ishikawa	Both	1 to 4	Diarrheal diseases	2011	20.338392	28.227613	14.082740
21596	Prevalence	35440	Ishikawa	Both	Early Neonatal	Lower respiratory infections	2011	1.201336	1.496299	0.949697
21597	Prevalence	35440	Ishikawa	Both	Late Neonatal	Lower respiratory infections	2011	1.793935	2.122410	1.513424
21598	Prevalence	35440	Ishikawa	Both	Post Neonatal	Lower respiratory infections	2011	15.925677	18.657651	13.490323
21599	Prevalence	35440	Ishikawa	Both	1 to 4	Lower respiratory infections	2011	48.524187	62.802658	36.607084

	measure	location_id	location	sex	ages	cause	year	estimate	value
16200	Prevalence	110	Dominica	Both	Early Neonatal	Diarrheal diseases	2011	val	0.367450
16201	Prevalence	110	Dominica	Both	Late Neonatal	Diarrheal diseases	2011	val	2.357251
16202	Prevalence	110	Dominica	Both	Post Neonatal	Diarrheal diseases	2011	val	34.338314
16203	Prevalence	110	Dominica	Both	1 to 4	Diarrheal diseases	2011	val	69.949927
16204	Prevalence	111	Dominican Republic	Both	Early Neonatal	Lower respiratory infections	2011	val	78.189441
...	...	...	...	...	...	...	...	...	...
64795	Prevalence	35440	Ishikawa	Both	1 to 4	Diarrheal diseases	2011	lower	14.082740
64796	Prevalence	35440	Ishikawa	Both	Early Neonatal	Lower respiratory infections	2011	lower	0.949697
64797	Prevalence	35440	Ishikawa	Both	Late Neonatal	Lower respiratory infections	2011	lower	1.513424
64798	Prevalence	35440	Ishikawa	Both	Post Neonatal	Lower respiratory infections	2011	lower	13.490323
64799	Prevalence	35440	Ishikawa	Both	1 to 4	Lower respiratory infections	2011	lower	36.607084

	measure	location_id	location	sex	ages	cause	year	risk	estimate_risk	value_risk	estimate_cause	value_cause
0	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	val	4.677587e+06
1	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	upper	5.694902e+06
2	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	lower	3.793404e+06
3	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	upper	2.849573e+06	val	4.677587e+06
4	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	upper	2.849573e+06	upper	5.694902e+06
...	...	...	...	...	...	...	...	...	...	...	...	...
127570	YLD	4618	North East England	Both	Early Neonatal	Lower respiratory infections	2011	NaN	NaN	NaN	upper	7.599097e-01
127571	YLD	4618	North East England	Both	Early Neonatal	Lower respiratory infections	2011	NaN	NaN	NaN	lower	3.123462e-01
127572	YLD	4618	North East England	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	val	6.299188e+00
127573	YLD	4618	North East England	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	upper	9.874345e+00
127574	YLD	4618	North East England	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	lower	3.782328e+00

	measure_risk_cause	location_id	location	sex	ages	cause	year	risk	estimate_risk	value_risk	estimate_cause	value_cause	estimate_prevalence	value_prevalence
0	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	val	4.677587e+06	val	280221.248215
1	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	val	4.677587e+06	upper	304872.583614
2	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	val	4.677587e+06	lower	255016.302789
3	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	upper	5.694902e+06	val	280221.248215
4	YLL	1	Global	Both	Late Neonatal	Diarrheal diseases	2011	Non-exclusive breastfeeding	val	2.213902e+06	upper	5.694902e+06	upper	304872.583614
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
382720	YLD	44778	South Gloucestershire	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	upper	1.170756e+00	upper	14.100846
382721	YLD	44778	South Gloucestershire	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	upper	1.170756e+00	lower	8.028466
382722	YLD	44778	South Gloucestershire	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	lower	3.186603e-01	val	10.624555
382723	YLD	44778	South Gloucestershire	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	lower	3.186603e-01	upper	14.100846
382724	YLD	44778	South Gloucestershire	Both	1 to 4	Lower respiratory infections	2011	NaN	NaN	NaN	lower	3.186603e-01	lower	8.028466

Child exposure to diseases caused by suboptimal breastfeeding¶

Table of Contents

Theory¶

Data download¶

Cause data¶

Risk-specific data¶

Load and clean data¶

Cause data¶

Risk data¶

Calculating risk-specific prevalence¶

YLD vs. YLL vs. DALY¶

Calculating exposure¶