un_world_pop

This module contains methods for extracting data from UN World Population files.

These methods output dataframes that can be used to initialize Demographic objects in EMOD.

`extract_fertility(country, version, filename=None)`

This code is for extracting fertility rates for the given country from the fertility files downloaded from the UN World Pop. The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be the Age Specific rates (i.e. WPP2012_FERT_F07_AGE_SPECIFIC_FERTILITY, WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER)

Parameters:

Name	Type	Description	Default
`country`	`str`	The name of the country used in the spreadsheet for which you want to extract the data.	required
`version`	`str`	A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024	required
`filename`	`Union[str, Path]`	If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be extracted from the first two. The 'estimates' sheet gives you data for the past while the 'medium variant' sheet gives you the data for the future. It is expected to be in the Strict Open XML Spreadsheet format.	`None`

Returns:

Type	Description
`YearAgeRate`	A YearAgeRate object containing the fertility data in the given file.

Source code in emodpy_hiv/demographics/un_world_pop.py

def extract_fertility(country: str,
                      version: str,
                      filename: Union[str, Path] = None):
    """
    This code is for extracting fertility rates for the given country from the
    [fertility files](https://population.un.org/wpp/Download/Standard/Fertility/) downloaded from the UN World Pop.
    The code assumes that the files is in Strict Open XML Spreadsheet format.  This format require
    us to use the 'calamine' engine to read the file.  The file is expected to be the Age Specific
    rates (i.e. WPP2012_FERT_F07_AGE_SPECIFIC_FERTILITY, WPP2024_FERT_F02_FERTILITY_RATES_BY_5-YEAR_AGE_GROUPS_OF_MOTHER)

    Args:
        country (str):
            The name of the country used in the spreadsheet for which you want to extract the data.

        version (str):
            A string with the year/version of the file.  Supported versions are 2012, 2015, 2019, 2024

        filename (Union[str, Path]):
            If not provided, the 'version' will be used to select from the known versions. If a filename
            is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
            but the data will be extracted from the first two. The 'estimates' sheet gives you data for the
            past while the 'medium variant' sheet gives you the data for the future. It is expected to be in
            the Strict Open XML Spreadsheet format.

    Returns:
        (YearAgeRate): A YearAgeRate object containing the fertility data in the given file.
    """
    if filename is None:
        filename = _get_fertility_filename(version)
    _check_filename(filename)

    # ---------------------------------------------------------
    # --- Define column names and sheets to read the data from.
    # ---------------------------------------------------------
    PERIOD_COL  = "Period" # noqa: E221
    AGE_15_19   = "15-19"  # noqa: E221
    AGE_20_24   = "20-24"  # noqa: E221
    AGE_25_29   = "25-29"  # noqa: E221
    AGE_30_34   = "30-34"  # noqa: E221
    AGE_35_39   = "35-39"  # noqa: E221
    AGE_40_44   = "40-44"  # noqa: E221
    AGE_45_49   = "45-49"  # noqa: E221

    age_cols = [AGE_15_19, AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39, AGE_40_44, AGE_45_49]

    if version == "2012":
        sheet_list = ["ESTIMATES", "MEDIUM FERTILITY"]
        COUNTRY_COL = "Major area, region, country or area *"
    elif version == "2015":
        sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
        COUNTRY_COL = "Major area, region, country or area *"
    elif version == "2019":
        sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
        COUNTRY_COL = "Region, subregion, country or area *"
    elif version == "2024":
        sheet_list = ["Estimates", "Medium variant"]
        COUNTRY_COL = "Region, subregion, country or area *"
        PERIOD_COL  = "Year" # noqa: E221
        AGE_10_14 = "10-14"
        AGE_50_54 = "50-54"
        age_cols.insert(0, AGE_10_14)
        age_cols.append(AGE_50_54)
    else:
        raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))

    cols_to_read = [COUNTRY_COL, PERIOD_COL]
    cols_to_read.extend(age_cols)

    # --------------------------------
    # --- Extract data from the sheets
    # --------------------------------
    df = pd.DataFrame()
    for sheet in sheet_list:
        df_sheet = pd.read_excel(filename,
                                 sheet_name=sheet,
                                 skiprows=16,
                                 usecols=cols_to_read,
                                 engine="calamine")
        _check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
        df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
        df = pd.concat([df, df_sheet])

    df = df.drop(COUNTRY_COL, axis=1)

    # ------------------------------------------------------------
    # --- Change the Period column to only contain min value/year
    # ------------------------------------------------------------
    if version == "2012" or version == "2015" or version == "2019":
        df[PERIOD_COL] = df[PERIOD_COL].str.slice_replace(4, 9, "").astype(float)

    # -----------------------------------------------------------
    # --- rename the age columns so they are just the minimum age
    # -----------------------------------------------------------
    new_col_names = {}
    for col in age_cols:
        new_col_names[col] = col[0:2]
    df.rename(columns=new_col_names, inplace=True)

    df = pd.melt(df, id_vars=[PERIOD_COL], var_name=YearAgeRate.COL_NAME_MIN_AGE, value_name=YearAgeRate.COL_NAME_RATE)

    # ----------------------------------------------------------
    # --- Convert dataframe into a YearAgeRate dataframe format
    # ----------------------------------------------------------
    df.rename({PERIOD_COL: YearAgeRate.COL_NAME_MIN_YEAR}, axis=1, inplace=True)
    df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE ].astype(float)    # noqa: E202
    df[YearAgeRate.COL_NAME_MIN_YEAR] = df[YearAgeRate.COL_NAME_MIN_YEAR].astype(float)
    df[YearAgeRate.COL_NAME_NODE_ID ] = 0                                                  # noqa: E202
    df = df.sort_values(by=[YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], ascending=True)

    df[YearAgeRate.COL_NAME_RATE] = df[YearAgeRate.COL_NAME_RATE].astype(float).round(1)
    df = df[YearAgeRate.COL_NAMES]

    return YearAgeRate(df=df)

`extract_mortality(country, version, gender=None, filename=None)`

This code is for extracting mortality rates for the given country from the mortality files downloaded from the UN World Pop. The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be for one gender and to be the Abridged Life Table.

Parameters:

Name	Type	Description	Default
`country`	`str`	The name of the country used in the spreadsheet for which you want to extract the data.	required
`version`	`str`	A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024	required
`gender`	`str`	The gender of the data to be extracted. Possible values are 'male' and 'female'. Required if the filename is not provided.	`None`
`filename`	`Union[str, Path]`	If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be extracted from the first one plus the next one or two. The 'estimates' sheet gives you data for the past while the 'medium XXX' sheets give you the data for the future. It is expected to be in the Strict Open XML Spreadsheet format.	`None`

Returns:

Type	Description
`YearAgeRate`	A YearAgeRate object containing the mortality data in the given file.

Source code in emodpy_hiv/demographics/un_world_pop.py

def extract_mortality(country: str,
                      version: str,
                      gender: str = None,
                      filename: Union[str, Path] = None):
    """
    This code is for extracting mortality rates for the given country from the
    [mortality files](https://population.un.org/wpp/Download/Standard/Mortality/) downloaded from the UN World Pop.
    The code assumes that the files is in Strict Open XML Spreadsheet format.  This format require
    us to use the 'calamine' engine to read the file.  The file is expected to be for one gender
    and to be the Abridged Life Table.

    Args:
        country (str):
            The name of the country used in the spreadsheet for which you want to extract the data.

        version (str):
            A string with the year/version of the file.  Supported versions are 2012, 2015, 2019, 2024

        gender (str):
            The gender of the data to be extracted.  Possible values are 'male' and 'female'.
            Required if the filename is not provided.

        filename (Union[str, Path]):
            If not provided, the 'version' will be used to select from the known versions.  If a filename
            is provided, it is assumed to be file from the UN World Pop website.  It may have several sheets
            but the data will be extracted from the first one plus the next one or two.  The 'estimates'
            sheet gives you data for the past while the 'medium XXX' sheets give you the data for the future.
            It is expected to be in the Strict Open XML Spreadsheet format.

    Returns:
        (YearAgeRate): A YearAgeRate object containing the mortality data in the given file.
    """
    if filename is None:
        filename = _get_mortality_filename(version, gender)
    _check_filename(filename)

    # ---------------------------------------------------------
    # --- Define column names and sheets to read the data from.
    # ---------------------------------------------------------
    COUNTRY_COL      = "Region"                     # noqa: E221
    PERIOD_COL       = "Period"                     # noqa: E221
    AGE_COL          = "Age (x)"                    # noqa: E221
    AGE_INTERVAL_COL = "Age interval (n)"
    DEATH_RATE_COL   = "Central death rate m(x,n)"  # noqa: E221

    sheet_list = []
    if version == "2012":
        COUNTRY_COL = "Major area, region, country or area *"
        sheet_list = ["ESTIMATES", "MEDIUM_2010-2050", "MEDIUM_2050-2100"]
    elif version == "2015":
        COUNTRY_COL = "Major area, region, country or area *"
        sheet_list = ["ESTIMATES", "MEDIUM 2015-2050", "MEDIUM 2050-2100"]
    elif version == "2019":
        COUNTRY_COL = "Region, subregion, country or area *"
        sheet_list = ["ESTIMATES", "MEDIUM 2020-2050", "MEDIUM 2050-2100"]
    elif version == "2024":
        COUNTRY_COL = "Region, subregion, country or area *"
        PERIOD_COL  = "Year" # noqa: E221
        sheet_list = ["Estimates", "Medium variant"]
    else:
        raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))

    cols_to_read = [COUNTRY_COL, PERIOD_COL, AGE_COL, AGE_INTERVAL_COL, DEATH_RATE_COL]

    col_rename_dict = {
        PERIOD_COL    : YearAgeRate.COL_NAME_MIN_YEAR,  # noqa: E203
        AGE_COL       : YearAgeRate.COL_NAME_MIN_AGE,   # noqa: E203
        DEATH_RATE_COL: YearAgeRate.COL_NAME_RATE
    }

    # --------------------------------
    # --- Extract data from the sheets
    # --------------------------------
    df = pd.DataFrame()
    for sheet in sheet_list:
        df_sheet = pd.read_excel(filename,
                                 sheet_name=sheet,
                                 skiprows=16,
                                 usecols=cols_to_read,
                                 engine="calamine")
        _check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
        df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
        df = pd.concat([df, df_sheet])

    # ------------------------------------------------------------
    # --- Change the Period column to only contain min value/year
    # ------------------------------------------------------------
    if version == "2012" or version == "2015" or version == "2019":
        df[PERIOD_COL] = df[PERIOD_COL].str.slice_replace(4, 9, "").astype(float)

    # ----------------------------------------------------------
    # --- Convert dataframe into a YearAgeRate dataframe format
    # ----------------------------------------------------------
    df.rename(col_rename_dict, axis=1, inplace=True)

    df = df.drop([COUNTRY_COL, AGE_INTERVAL_COL], axis=1)

    df[YearAgeRate.COL_NAME_NODE_ID ] = 0                                                 # noqa: E202
    df[YearAgeRate.COL_NAME_MIN_YEAR] = df[YearAgeRate.COL_NAME_MIN_YEAR].astype(float)
    df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE ].astype(float)   # noqa: E202
    df[YearAgeRate.COL_NAME_RATE    ] = df[YearAgeRate.COL_NAME_RATE    ].round(8)        # noqa: E202
    df = df[YearAgeRate.COL_NAMES]

    return YearAgeRate(df=df)

`extract_population_by_age(country, version, years, filename=None)`

This code is for extracting population by age data from the files downloaded from the UN World Pop. The code assumes that the files is in Strict Open XML Spreadsheet format. This format require us to use the 'calamine' engine to read the file. The file is expected to be for both sexes and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx, WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx)

Parameters:

Name	Type	Description	Default
`country`	`str`	The name of the country used in the spreadsheet for which you want to extract the data.	required
`version`	`str`	A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024	required
`years`	`list[float]`	A list of years to get data for NOTE: Data files say it is for July 1st of the given year.	required
`filename`	`Union[str, Path]`	If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be extracted from the first two. The 'estimates' sheet gives you data for the past while the 'medium variant' sheet gives you the data for the future. It is expected to be in the Strict Open XML Spreadsheet format.	`None`

Returns:

Type	Description
`DataFrame`	It will return a pandas DataFrame where each row is a year and the columns are for an age range.

Source code in emodpy_hiv/demographics/un_world_pop.py

def extract_population_by_age(country: str,
                              version: str,
                              years: list[float],
                              filename: Union[str, Path] = None):
    """
    This code is for extracting [population by age data](https://population.un.org/wpp/Download/Standard/Population/)
    from the files downloaded from the UN World Pop.
    The code assumes that the files is in Strict Open XML Spreadsheet format. This format require
    us to use the 'calamine' engine to read the file. The file is expected to be for both sexes
    and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx,
    WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx)

    Args:
        country (str):
            The name of the country used in the spreadsheet for which you want to extract the data.

        version (str):
            A string with the year/version of the file.  Supported versions are 2012, 2015, 2019, 2024

        years (list[float]):
            A list of years to get data for
            NOTE: Data files say it is for July 1st of the given year.

        filename (Union[str, Path]):
            If not provided, the 'version' will be used to select from the known versions. If a filename
            is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
            but the data will be extracted from the first two. The 'estimates' sheet gives you data for the
            past while the 'medium variant' sheet gives you the data for the future. It is expected to be in
            the Strict Open XML Spreadsheet format.

    Returns:
        (pandas.DataFrame): It will return a pandas DataFrame where each row is a year and the columns are for an age range.
    """
    if filename is None:
        filename = _get_population_filename(version)
    _check_filename(filename)

    # ---------------------------------------------------------
    # --- Define column names and sheets to read the data from.
    # ---------------------------------------------------------
    COUNTRY_COL = "Major area, region, country or area *"
    PERIOD_COL  = "Reference date (as of 1 July)"  # noqa: E221
    AGE_00_04   = "0-4"                            # noqa: E221
    AGE_05_09   = "5-9"                            # noqa: E221
    AGE_10_14   = "10-14"                          # noqa: E221
    AGE_15_19   = "15-19"                          # noqa: E221
    AGE_20_24   = "20-24"                          # noqa: E221
    AGE_25_29   = "25-29"                          # noqa: E221
    AGE_30_34   = "30-34"                          # noqa: E221
    AGE_35_39   = "35-39"                          # noqa: E221
    AGE_40_44   = "40-44"                          # noqa: E221
    AGE_45_49   = "45-49"                          # noqa: E221
    AGE_50_54   = "50-54"                          # noqa: E221
    AGE_55_59   = "55-59"                          # noqa: E221
    AGE_60_64   = "60-64"                          # noqa: E221
    AGE_65_69   = "65-69"                          # noqa: E221
    AGE_70_74   = "70-74"                          # noqa: E221
    AGE_75_79   = "75-79"                          # noqa: E221
    AGE_80_84   = "80-84"                          # noqa: E221
    AGE_85_89   = "85-89"                          # noqa: E221
    AGE_90_94   = "90-94"                          # noqa: E221
    AGE_95_99   = "95-99"                          # noqa: E221
    AGE_100     = "100+"                           # noqa: E221
    AGE_80_PLUS = "80+"                            # noqa: E221

    sheet_list = []
    age_cols = [AGE_00_04, AGE_05_09, AGE_10_14, AGE_15_19,
                AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39,
                AGE_40_44, AGE_45_49, AGE_50_54, AGE_55_59,
                AGE_60_64, AGE_65_69, AGE_70_74, AGE_75_79,
                AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99,
                AGE_100]

    medium_cols = age_cols
    if version == "2012":
        sheet_list = ["ESTIMATES", "MEDIUM FERTILITY"]
        COUNTRY_COL = "Major area, region, country or area *"
        PERIOD_COL  = "Reference date (as of 1 July)" # noqa: E221
        medium_cols = age_cols.copy()
        age_cols.append(AGE_80_PLUS)
    elif version == "2015":
        sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
        COUNTRY_COL = "Major area, region, country or area *"
        PERIOD_COL  = "Reference date (as of 1 July)" # noqa: E221
        medium_cols = age_cols.copy()
        age_cols.append(AGE_80_PLUS)
    elif version == "2019":
        sheet_list = ["ESTIMATES", "MEDIUM VARIANT"]
        COUNTRY_COL = "Region, subregion, country or area *"
        PERIOD_COL  = "Reference date (as of 1 July)" # noqa: E221
    elif version == "2024":
        sheet_list = ["Estimates", "Medium variant"]
        COUNTRY_COL = "Region, subregion, country or area *"
        PERIOD_COL  = "Year" # noqa: E221
    else:
        raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))

    # --------------------------------
    # --- Extract data from the sheets
    # --------------------------------
    df = pd.DataFrame()
    for sheet in sheet_list:
        cols_to_read = [COUNTRY_COL, PERIOD_COL]
        if "estimate" in sheet.lower():
            cols_to_read.extend(age_cols)
        else:
            cols_to_read.extend(medium_cols)

        df_sheet = pd.read_excel(filename,
                                 sheet_name=sheet,
                                 skiprows=16,
                                 usecols=cols_to_read,
                                 engine="calamine")
        _check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
        df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
        df = pd.concat([df, df_sheet])
    df = df.drop(COUNTRY_COL, axis=1)

    # -----------------------------------
    # --- Get the rows of data we want
    # -----------------------------------
    _check_years(years, df[PERIOD_COL].unique(), filename)
    df = df[df[PERIOD_COL].isin(years)]

    # ---------------------------------------------------------------
    # --- In the 2012, 2015 data, the Estimates sheet has this "80+" column
    # --- that can be used instead of the 80-84, etc columns.
    # --- NOTE: This is crappy logic, but I need to move on.
    # ---------------------------------------------------------------
    if version == "2012" or version == "2015":
        val = df[AGE_80_PLUS].iloc[0]
        has_80p = isinstance(val, float) or isinstance(val, int)
        if has_80p:
            df = df.drop([AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100], axis=1)
            for col in [AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100]:
                age_cols.remove(col)
        else:
            df = df.drop(AGE_80_PLUS, axis=1)
            age_cols.remove(AGE_80_PLUS)

    # -----------------------------------------------------------
    # --- rename the age columns so they are just the minimum age
    # -----------------------------------------------------------
    new_col_names = {}
    for col in age_cols:
        if col == AGE_80_PLUS:
            new_col_names[col] = "80"
        elif len(col) == 3:
            new_col_names[col] = col[0:1]
        elif len(col) == 5:
            new_col_names[col] = col[0:2]
        else:
            new_col_names[col] = col[0:3]
    df.rename(columns=new_col_names, inplace=True)

    # ------------------------------------------------------------------
    # --- Some columns can have the "...".  This changes those to zeros.
    # --- (This could go away with the 80+ handling.)
    # ------------------------------------------------------------------
    #  df[year] = ( pd.to_numeric(df[year], errors='coerce' ).fillna(0) )

    # -----------------------------------------------------------------
    # --- The data is in thousands of people so multiply by 1000
    # --- and convert to integer since we want the value as an integer.
    # -----------------------------------------------------------------
    for column in df.columns:
        if column != PERIOD_COL:
            df[column] = df[column] * 1000

    return df

`extract_population_by_age_and_distribution(country, version, year=1960, filename=None)`

This code is for extracting population by age data from the files downloaded from the UN World Pop. The code assumes that the files is in Strict Open XML Spreadsheet format. This format requires us to use the 'calamine' engine to read the file. The file is expected to be for both sexes and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx, WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx)

Parameters:

Name	Type	Description	Default
`country`	`str`	The name of the country used in the spreadsheet for which you want to extract the data.	required
`version`	`str`	A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024	required
`year`	`int`	The year in the data to get the total population and age distribution for. Default is 1960 NOTE: Data files say it is for July 1st of the given year.	`1960`
`filename`	`Union[str, Path]`	If not provided, the 'version' will be used to select from the known versions. If a filename is provided, it is assumed to be file from the UN World Pop website. It may have several sheets but the data will be only be extracted from the first sheet, 'estimates'. The 'estimates' sheet gives you data for the past. It is assumed that you are using this function to get the starting point of an EMOD simulation that starts in the past. It is expected to be in the Strict Open XML Spreadsheet format.	`None`

Returns:

Type	Description
`float`	The total population for the given year
`YearAgeRate`	A YearAgeRate object where the "rate" column contains the fraction of people in that particular year and age ranges.

Source code in emodpy_hiv/demographics/un_world_pop.py

def extract_population_by_age_and_distribution(country: str,
                                               version: str,
                                               year: int = 1960,
                                               filename: Union[str, Path] = None):
    """
    This code is for extracting [population by age data](https://population.un.org/wpp/Download/Standard/Population/)
    from the files downloaded from the UN World Pop.
    The code assumes that the files is in Strict Open XML Spreadsheet format. This format requires
    us to use the 'calamine' engine to read the file. The file is expected to be for both sexes
    and to have 5 year age bins. (i.e. WPP2019_POP_F15_1_ANNUAL_POPULATION_BY_AGE_BOTH_SEXES.xlsx,
    WPP2024_POP_F02_1_POPULATION_5-YEAR_AGE_GROUPS_BOTH_SEXES.xlsx)

    Args:
        country (str):
            The name of the country used in the spreadsheet for which you want to extract the data.

        version (str):
            A string with the year/version of the file. Supported versions are 2012, 2015, 2019, 2024

        year (int):
            The year in the data to get the total population and age distribution for. Default is 1960
            NOTE: Data files say it is for July 1st of the given year.

        filename (Union[str, Path]):
            If not provided, the 'version' will be used to select from the known versions. If a filename
            is provided, it is assumed to be file from the UN World Pop website. It may have several sheets
            but the data will be only be extracted from the first sheet, 'estimates'. The 'estimates' sheet
            gives you data for the past. It is assumed that you are using this function to get the starting
            point of an EMOD simulation that starts in the past. It is expected to be in the Strict Open XML
            Spreadsheet format.

    Returns:
        (float): The total population for the given year
        (YearAgeRate): A YearAgeRate object where the "rate" column contains the fraction of people in that particular year and age ranges.
    """
    if filename is None:
        filename = _get_population_filename(version)
    _check_filename(filename)

    # ---------------------------------------------------------
    # --- Define column names and sheets to read the data from.
    # ---------------------------------------------------------
    COUNTRY_COL = "Major area, region, country or area *"
    PERIOD_COL  = "Reference date (as of 1 July)"  # noqa: E221
    AGE_00_04   = "0-4"                            # noqa: E221
    AGE_05_09   = "5-9"                            # noqa: E221
    AGE_10_14   = "10-14"                          # noqa: E221
    AGE_15_19   = "15-19"                          # noqa: E221
    AGE_20_24   = "20-24"                          # noqa: E221
    AGE_25_29   = "25-29"                          # noqa: E221
    AGE_30_34   = "30-34"                          # noqa: E221
    AGE_35_39   = "35-39"                          # noqa: E221
    AGE_40_44   = "40-44"                          # noqa: E221
    AGE_45_49   = "45-49"                          # noqa: E221
    AGE_50_54   = "50-54"                          # noqa: E221
    AGE_55_59   = "55-59"                          # noqa: E221
    AGE_60_64   = "60-64"                          # noqa: E221
    AGE_65_69   = "65-69"                          # noqa: E221
    AGE_70_74   = "70-74"                          # noqa: E221
    AGE_75_79   = "75-79"                          # noqa: E221
    AGE_80_84   = "80-84"                          # noqa: E221
    AGE_85_89   = "85-89"                          # noqa: E221
    AGE_90_94   = "90-94"                          # noqa: E221
    AGE_95_99   = "95-99"                          # noqa: E221
    AGE_100     = "100+"                           # noqa: E221
    AGE_80_PLUS = "80+"                            # noqa: E221

    sheet_list = []
    age_cols = [AGE_00_04, AGE_05_09, AGE_10_14, AGE_15_19,
                AGE_20_24, AGE_25_29, AGE_30_34, AGE_35_39,
                AGE_40_44, AGE_45_49, AGE_50_54, AGE_55_59,
                AGE_60_64, AGE_65_69, AGE_70_74, AGE_75_79,
                AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99,
                AGE_100]

    if version == "2012" or version == "2015":
        sheet_list = ["ESTIMATES"] # "MEDIUM FERTILITY"
        COUNTRY_COL = "Major area, region, country or area *"
        PERIOD_COL  = "Reference date (as of 1 July)" # noqa: E221
        age_cols.append(AGE_80_PLUS)
    elif version == "2019":
        sheet_list = ["ESTIMATES"] # "MEDIUM VARIANT"
        COUNTRY_COL = "Region, subregion, country or area *"
        PERIOD_COL  = "Reference date (as of 1 July)" # noqa: E221
    elif version == "2024":
        sheet_list = ["Estimates"] # "Medium variant"
        COUNTRY_COL = "Region, subregion, country or area *"
        PERIOD_COL  = "Year" # noqa: E221
    else:
        raise ValueError("version '{0}' is not supported.\nOnly 2012, 2015, 2019, 2024 formats are supported".format(version))

    cols_to_read = [COUNTRY_COL, PERIOD_COL]
    cols_to_read.extend(age_cols)

    # --------------------------------
    # --- Extract data from the sheets
    # --------------------------------
    df = pd.DataFrame()
    for sheet in sheet_list:
        df_sheet = pd.read_excel(filename,
                                 sheet_name=sheet,
                                 skiprows=16,
                                 usecols=cols_to_read,
                                 engine="calamine")
        _check_country(country, df_sheet[COUNTRY_COL].unique(), filename)
        df_sheet = df_sheet[df_sheet[COUNTRY_COL] == country]
        df = pd.concat([df, df_sheet])
    df = df.drop(COUNTRY_COL, axis=1)

    # -----------------------------------
    # --- Get the one row of data we want
    # -----------------------------------
    df = df[df[PERIOD_COL] == year]

    # ---------------------------------------------------------------
    # --- In the 2012, 2015 data, the Estimates sheet has this "80+" column
    # --- that can be used instead of the 80-84, etc columns.
    # --- NOTE: This is crappy logic, but I need to move on.
    # ---------------------------------------------------------------
    if version == "2012" or version == "2015":
        val = df[AGE_80_PLUS].iloc[0]
        has_80p = isinstance(val, float) or isinstance(val, int)
        if has_80p:
            df = df.drop([AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100], axis=1)
            for col in [AGE_80_84, AGE_85_89, AGE_90_94, AGE_95_99, AGE_100]:
                age_cols.remove(col)
        else:
            df = df.drop(AGE_80_PLUS, axis=1)
            age_cols.remove(AGE_80_PLUS)

    # -----------------------------------------------------------
    # --- rename the age columns so they are just the minimum age
    # -----------------------------------------------------------
    new_col_names = {}
    for col in age_cols:
        if col == AGE_80_PLUS:
            new_col_names[col] = "80"
        elif len(col) == 3:
            new_col_names[col] = col[0:1]
        elif len(col) == 5:
            new_col_names[col] = col[0:2]
        else:
            new_col_names[col] = col[0:3]
    df.rename(columns=new_col_names, inplace=True)

    # -------------------------------------------------------------------
    # --- Transpose the data so we have rows of min ages for our one Year
    # -------------------------------------------------------------------
    df = df.set_index(PERIOD_COL, inplace=False)
    df = df.transpose()
    df = df.rename_axis("tmp_min_age")

    # ------------------------------------------------------------------
    # --- Some columns can have the "...".  This changes those to zeros.
    # --- (This could go away with the 80+ handling.)
    # ------------------------------------------------------------------
    df[year] = (pd.to_numeric(df[year], errors='coerce').fillna(0))

    # ---------------------------------------------------------------
    # --- Change the data to the fraction of people in that age range
    # --- since this is supposed to be an age distribution.
    # ---------------------------------------------------------------
    total_pop = df[year].sum()
    df = df / total_pop

    # -----------------------------------------------------------------
    # --- The data is in thousands of people so multiply by 1000
    # --- and convert to integer since we want the value as an integer.
    # -----------------------------------------------------------------
    total_pop = int(1000 * total_pop)

    # --------------------------------------------------
    # --- Convert dataframe into a YearAgeRate dataframe
    # --------------------------------------------------
    df[YearAgeRate.COL_NAME_MIN_AGE ] = df.index.values                                 # noqa: E202
    df[YearAgeRate.COL_NAME_MIN_AGE ] = df[YearAgeRate.COL_NAME_MIN_AGE].astype(float)  # noqa: E202
    df[YearAgeRate.COL_NAME_NODE_ID ] = 0                                               # noqa: E202
    df[YearAgeRate.COL_NAME_MIN_YEAR] = year
    df.rename({year: YearAgeRate.COL_NAME_RATE}, axis=1, inplace=True)

    df[YearAgeRate.COL_NAME_RATE] = df[YearAgeRate.COL_NAME_RATE].astype(float).round(8)
    df = df[YearAgeRate.COL_NAMES]
    df.reset_index()

    return total_pop, YearAgeRate(df)