Skip to content

infer_natural_mortality

infer_natural_mortality(year_age_rate_data, interval_fit=None, predict_horizon=2100.0)

Convert the raw mortality data to what would have been the expected natural mortality had HIV not caused a spike in mortality. This algorithm assumes that during the 'interval_fit' mortality rates had a relatively steady decline (i.e. fewer people were dying each year), but shortly after the 'interval_fit' there was a spike in mortality. The algorithm fits the data in the 'interval_fit' and extrapolates that trend forward to the 'predict_horizon'.

It assumes that the input data is for only one node.

Parameters:

Name Type Description Default
year_age_rate_data YearAgeRate

This is a YearAgeRate data object containing the raw mortality data, probably output from the extract_mortality() function

required
interval_fit tuple[float, float]

This tuple contains the range of years used to determine the mortality trend before the HIV epidemic. The trend fitted to these years is extrapolated forward to 'predict_horizon'. If None, the interval (1970, 1980) is used.

None
predict_horizon float

This determines how far out the data will be extrapolated.

2100.0

Returns:

Type Description
YearAgeRate

A YearAgeRate object with the inferred mortality rates.

Source code in emodpy_hiv/demographics/infer_natural_mortality.py
def infer_natural_mortality(year_age_rate_data: YearAgeRate,
                            interval_fit: tuple[float, float] = None,
                            predict_horizon: float = 2100.0) -> YearAgeRate:
    """
    Convert the raw mortality data to what would have been the expected natural mortality had HIV
    not caused a spike in mortality.  This algorithm assumes that during the 'interval_fit'
    mortality rates had a relatively steady decline (i.e. fewer people were dying each year),
    but shortly after the 'interval_fit' there was a spike in mortality.  The algorithm fits
    a linear model to log(rate) versus year over the 'interval_fit', separately for each age,
    and uses it to predict the rates from the start of the 'interval_fit' to 'predict_horizon'.
    For each row, the returned rate is the smaller of the raw rate and the predicted rate.
    Rows before the 'interval_fit' keep their raw rate.

    It assumes that the input data is for only one node.

    Args:
        year_age_rate_data: This is a YearAgeRate data object containing the raw mortality data,
            probably output from the **extract_mortality()** function
        interval_fit: This tuple contains the range of years used to determine the
            mortality trend before the HIV epidemic.  The first value must be strictly
            less than the second.  If None, (1970, 1980) is used.
        predict_horizon: This determines how far out the data will be extrapolated.
            It must be greater than the second value of 'interval_fit'.  Rows whose
            year is beyond this value are not present in the result.

    Returns:
        A YearAgeRate object with the inferred mortality rates.

    Raises:
        ValueError: If the input does not contain exactly one node, if 'interval_fit'
            is not strictly increasing, or if 'predict_horizon' is not greater than
            the end of 'interval_fit'.
    """

    node_id_list = year_age_rate_data.df[YearAgeRate.COL_NAME_NODE_ID].unique().tolist()
    if len(node_id_list) != 1:
        msg  =  "Invalid number of nodes in the YearAgeRate object.\n"         # noqa: E221, E222
        msg +=  "The algorithm currently only supports one node at a time.\n"  # noqa: E222
        msg += f"The input has the following nodes: {node_id_list}"
        raise ValueError(msg)

    if interval_fit is None:
        interval_fit = (1970, 1980)
    elif interval_fit[1] <= interval_fit[0]:
        msg  =  "Invalid 'interval_fit' values.\n" # noqa: E221, E222
        msg += f"The first value in the tuple, {interval_fit[0]} must be strictly less\n"
        msg += f"than the second value, {interval_fit[1]}."
        raise ValueError(msg)

    if predict_horizon <= interval_fit[1]:
        msg  = "Invalid value of 'predict_horizon' with respect to 'interval_fit'.\n" # noqa: E221
        msg += f"The 'predict_horizon' (={predict_horizon}) must be greater than\n"
        msg += f"the second value of 'interval_fit' (={interval_fit[1]})."
        # The message used to be built and then dropped; the check must actually fail.
        raise ValueError(msg)

    # Work on a copy so the caller's YearAgeRate object is not modified.
    df_mort = year_age_rate_data.df.copy()

    # ---------------------------------------------------
    # --- Log transform the data and sort by year and age
    # ---------------------------------------------------
    df_mort[YearAgeRate.COL_NAME_RATE] = df_mort[YearAgeRate.COL_NAME_RATE].apply(lambda x: np.log(x))
    df_mort.sort_values([YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], inplace=True)

    # -------------------------------------------------------------------
    # --- Extract the data that exists BEFORE the reference interval.
    # --- Do not include any data from the interval
    # --- inclusive=left means the right is not inclusive.
    # -------------------------------------------------------------------
    min_year_col = df_mort[YearAgeRate.COL_NAME_MIN_YEAR]
    df_before_time = df_mort[min_year_col.between(0, interval_fit[0], inclusive='left')].copy()

    # -------------------------------------------------------------------------
    # --- Get list of ages to extrapolate the data for and setup the dataframe
    # -------------------------------------------------------------------------
    age_list = df_mort[YearAgeRate.COL_NAME_MIN_AGE].unique().tolist()
    df_mort.set_index([YearAgeRate.COL_NAME_MIN_AGE], inplace=True)

    df_list = []
    for age in age_list:
        tmp_data = df_mort.loc[age, :]
        extrap_model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

        # -------------------------------------------------------------------------------------------------
        # --- Get the data (log(rate)) for the reference interval and from the beginning of the reference
        # --- interval all the way to the end of the prediction.
        # --- NOTE(review): positional column 1 is used as the year and column 2 as log(rate);
        # --- this relies on the column order of the YearAgeRate dataframe - confirm.
        # -------------------------------------------------------------------------------------------------
        min_year_col = tmp_data[YearAgeRate.COL_NAME_MIN_YEAR]
        first_extrap_df = tmp_data[min_year_col.between(interval_fit[0], interval_fit[1])]
        xx              = tmp_data[min_year_col.between(interval_fit[0], predict_horizon)].values[:, 1] # noqa: E221

        # ----------------------------------------------------------------------------------------
        # --- Have the model fit the reference interval and then use that to predict the interval
        # --- from the beginning of the reference interval to the end.
        # ----------------------------------------------------------------------------------------
        values = first_extrap_df.values
        extrap_model.fit(values[:, 1].reshape(-1, 1), values[:, 2])
        extrap_predictions = extrap_model.predict(xx.reshape(-1, 1))

        # ----------------------------------------------------
        # --- Create a new dataframe with the new predictions
        # ----------------------------------------------------
        extrap_dict = {
            YearAgeRate.COL_NAME_MIN_AGE: age,
            YearAgeRate.COL_NAME_MIN_YEAR: xx,
            'Extrap': extrap_predictions
        }
        loc_df = pd.DataFrame.from_dict(extrap_dict)
        loc_df.set_index([YearAgeRate.COL_NAME_MIN_AGE, YearAgeRate.COL_NAME_MIN_YEAR], inplace=True)

        df_list.append(loc_df)

    # ---------------------------------------------------------------
    # --- Concatenate the dataframes from the different ages together
    # ---------------------------------------------------------------
    df_e1 = pd.concat(df_list, axis=0)

    # -----------------------------------------------------------------
    # --- Merge the dataframe with the predictions with the original.
    # --- The default (inner) merge keeps only the (age, year) pairs
    # --- that have a prediction.
    # -----------------------------------------------------------------
    df_total = pd.merge(df_mort, df_e1, on=[YearAgeRate.COL_NAME_MIN_AGE, YearAgeRate.COL_NAME_MIN_YEAR])
    df_total = df_total.reset_index(inplace=False).set_index([YearAgeRate.COL_NAME_MIN_AGE], inplace=False)

    # ----------------------------------------------
    # --- Convert the data to rates from log(rates)
    # ----------------------------------------------
    df_total['Extrap'] = df_total['Extrap'].apply(np.exp)
    df_total['Data'] = df_total[YearAgeRate.COL_NAME_RATE].apply(np.exp)

    # ------------------------------------------------------------------
    # --- Convert the before time data to rates from log(rates) and
    # --- combine the data into one dataframe for the entire time period
    # ------------------------------------------------------------------
    df_before_time['Data'] = df_before_time[YearAgeRate.COL_NAME_RATE].apply(np.exp)
    df_before_time.set_index([YearAgeRate.COL_NAME_MIN_AGE], inplace=True)
    df_total = pd.concat([df_total, df_before_time], axis=0, join='outer', sort=True)
    df_total.reset_index(inplace=True)
    df_total.sort_values(by=[YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], inplace=True)

    # ---------------------------------------------------------------------
    # --- Replace NaNs
    # --- Rows before the reference interval have no 'Extrap' value, so the
    # --- raw data is used; otherwise the smaller of the two values is used.
    # ---------------------------------------------------------------------
    def min_not_nan(x_list):
        loc_in = list(filter(lambda x: not np.isnan(x), x_list))
        return np.min(loc_in)
    df_total[YearAgeRate.COL_NAME_RATE] = df_total[['Data', 'Extrap']].apply(min_not_nan, axis=1)

    # ------------------------------------------------------
    # --- Convert dataframe to one suitable for YearAgeRate
    # --- Sort without inplace so that we do not modify a
    # --- column-selected view of the dataframe.
    # ------------------------------------------------------
    df_total = df_total.drop(columns=['Data', 'Extrap'])
    df_total = df_total[YearAgeRate.COL_NAMES].sort_values(by=YearAgeRate.SORT_BY_COLUMNS)

    return YearAgeRate(df=df_total)

mortality_read_infer_plot(country, version, gender, interval_fit=None, save_data=False, other_csv_filename=None, img_dir=None, filename_to_save_to=None)

Extract the mortality data from the given file for the given country and plot both the raw data and the inferred data (without HIV deaths). The plot window will have a plot for each age where the data in the plot shows the rate per year for that age group. The label shows the minimum age of the band. This means that the age range being represented is from this value to the next largest age that is plotted.

Parameters:

Name Type Description Default
country str

The name of the country to be extracted. It must match exactly to the country name used in the referenced spreadsheet.

required
version str

The year/version of the indicated spreadsheet. Each year has a slightly different format. Supported versions are 2012, 2015, 2019, 2024

required
gender str

The gender of the data to be extracted. Possible values are 'male' and 'female'.

required
save_data bool

If true (default is False), two YearAgeRate CSV formatted files are output into the current directory. The files are: 'raw_mortality_year_age_rate.csv' and 'mortality_minus_hiv_year_age_rate.csv'.

False
other_csv_filename str

If the filename is defined, it reads an expected YearAgeRate CSV file and plots it with the raw and inferred data from the spreadsheet. This allows the user to plot other mortality data (e.g. older versions) with the new data.

None
img_dir str

If this is defined, the images are saved to this directory. If not defined, the images are displayed in a window.

None
filename_to_save_to str

The name of the file to save the image to. This is only used if img_dir is defined.

None

Return

No return

Source code in emodpy_hiv/demographics/infer_natural_mortality.py
def mortality_read_infer_plot(country: str,
                              version: str,
                              gender: str,
                              interval_fit: tuple[float, float] = None,
                              save_data: bool = False,
                              other_csv_filename: str = None,
                              img_dir: str = None,
                              filename_to_save_to: str = None) -> None:
    """
    Extract the mortality data from the given file for the given country and plot
    both the raw data and the inferred data (without HIV deaths).  The plot window will
    have a plot for each age where the data in the plot shows the rate per year for
    that age group.  The label shows the minimum age of the band.  This means that the
    age range being represented is from this value to the next largest age that is plotted.

    Args:
        country: The name of the country to be extracted.  It must match exactly to the
            country name used in the referenced spreadsheet.

        version: The year/version of the indicated spreadsheet.  Each year has a slightly
            different format.  Supported versions are 2012, 2015, 2019, 2024

        gender:
            The gender of the data to be extracted.  Possible values are 'male' and 'female'.

        interval_fit: The range of years (start, end) used to fit the mortality trend
            before the HIV epidemic.  If None (the default), (1970, 1980) is used.

        save_data: If true (default is False), two YearAgeRate CSV formatted files are
            output into the current directory.  The files are: 'raw_mortality_year_age_rate.csv'
            and 'mortality_minus_hiv_year_age_rate.csv'.

        other_csv_filename: If the filename is defined, it reads an expected YearAgeRate CSV
            file and plots it with the raw and inferred data from the spreadsheet.
            This allows the user to plot other mortality data (e.g. older versions)
            with the new data.

        img_dir: If this is defined, the images are saved to this directory.  If not defined,
            the images are displayed in a window.

        filename_to_save_to: The name of the file to save the image to.  This is only used if
            img_dir is defined.

    Return:
        No return
    """
    # Resolve the default here because the plot title below reports the interval;
    # subscripting None would raise a TypeError.  The value matches the default
    # that infer_natural_mortality() applies when it is given None.
    if interval_fit is None:
        interval_fit = (1970, 1980)

    raw_mortality_yar = unwp.extract_mortality(country=country,
                                               version=version,
                                               gender=gender)
    mortality_minus_hiv_yar = infer_natural_mortality(year_age_rate_data=raw_mortality_yar,
                                                      interval_fit=interval_fit,
                                                      predict_horizon=2100)

    if save_data:
        raw_mortality_yar.to_csv("raw_mortality_year_age_rate.csv")
        mortality_minus_hiv_yar.to_csv("mortality_minus_hiv_year_age_rate.csv")

    # The raw data is listed first and the inferred data second; the title
    # below refers to them as the red and blue curves respectively.
    plot_list = [raw_mortality_yar, mortality_minus_hiv_yar]

    if other_csv_filename:
        other_yar = YearAgeRate(csv_filename=other_csv_filename)
        plot_list.append(other_yar)

    title = f"Red: UN World Population, country={country}, version={version}, gender={gender}\n"
    title += f"Blue: Natural mortality inferred from {interval_fit[0]} to {interval_fit[1]}"
    year_age_rate.plot(year_age_rate_list=plot_list,
                       title=title,
                       node_id=0,
                       img_dir=img_dir,
                       filename_to_save_to=filename_to_save_to)

mortality_read_infer_plot_app(country, version, gender, min_year, max_year, save_data=False, other_csv_filename=None, img_dir=None)

Using the 'version' and 'gender', select the UN World Population mortality data file. Extract the mortality data from the selected file for the given country and plot both the raw data and the inferred data (without HIV deaths). The plot will have a plot for each age where the data in the plot shows the rate per year for that age group. The label shows the minimum age of the band. This means that the age range being represented is from this value to the next largest age that is plotted.

Parameters:

Name Type Description Default
country str

The name of the country to be extracted. It must match exactly to the country name used in the referenced spreadsheet.

required
version str

The year/version of the indicated spreadsheet. Each year has a slightly different format. Supported versions are 2012, 2015, 2019, 2024

required
gender str

The gender of the data to be extracted. Possible values are 'male' and 'female'.

required
min_year int

The start year of the interval to fit the data to.

required
max_year int

The end year of the interval to fit the data to.

required
save_data bool

If true (default is False), two YearAgeRate CSV formatted files are output into the current directory. The files are: 'raw_mortality_year_age_rate.csv' and 'mortality_minus_hiv_year_age_rate.csv'.

False
other_csv_filename str

If the filename is defined, it reads an expected YearAgeRate CSV file and plots it with the raw and inferred data from the spreadsheet. This allows the user to plot other mortality data (e.g. older versions) with the new data.

None
img_dir str

If this is defined, the images are saved to this directory. If not defined, the images are displayed in a window.

None

Return

No return

Source code in emodpy_hiv/demographics/infer_natural_mortality.py
def mortality_read_infer_plot_app(country: str,
                                  version: str,
                                  gender: str,
                                  min_year: int,
                                  max_year: int,
                                  save_data: bool = False,
                                  other_csv_filename: str = None,
                                  img_dir: str = None) -> None:
    """
    Validate the requested fit interval, derive an image file name from the
    arguments and delegate to mortality_read_infer_plot().

    Using the 'version' and 'gender', the UN World Population mortality data file is
    selected.  The mortality data for the given country is extracted and both the raw
    data and the inferred data (without HIV deaths) are plotted.  There is a plot for
    each age showing the rate per year for that age group.  The label shows the minimum
    age of the band, i.e. the band runs from this value to the next largest age plotted.

    Args:
        country: The name of the country to be extracted.  It must match exactly to the
            country name used in the referenced spreadsheet.

        version: The year/version of the indicated spreadsheet.  Each year has a slightly
            different format.  Supported versions are 2012, 2015, 2019, 2024

        gender: The gender of the data to be extracted.  Possible values are 'male' and 'female'.

        min_year: The start year of the interval to fit the data to.  Must be >= 1950
            and strictly less than 'max_year'.

        max_year: The end year of the interval to fit the data to.  Must be <= 2100.

        save_data: If true (default is False), two YearAgeRate CSV formatted files are
            output into the current directory.  The files are: 'raw_mortality_year_age_rate.csv'
            and 'mortality_minus_hiv_year_age_rate.csv'.

        other_csv_filename: If the filename is defined, it reads an expected YearAgeRate CSV
            file and plots it with the raw and inferred data from the spreadsheet.

        img_dir: If this is defined, the images are saved to this directory.  If not defined,
            the images are displayed in a window.

    Return:
        No return

    Raises:
        ValueError: If the interval is empty/reversed or falls outside 1950-2100.
    """
    # --- Guard clauses: reject an unusable interval before doing any work
    if not (min_year < max_year):
        problem = (f"Invalid interval: {min_year} >= {max_year}.\n"
                   "The min year must be less than the max year.")
        raise ValueError(problem)

    if min_year < 1950:
        problem = (f"Invalid interval: {min_year} < 1950.\n"
                   "The min year must be greater than or equal to 1950,\n"
                   "because the data is not present in the files.")
        raise ValueError(problem)

    if max_year > 2100:
        problem = (f"Invalid interval: {max_year} > 2100.\n"
                   "The max year must be less than or equal to 2100,\n"
                   "because the data is not present in the files.")
        raise ValueError(problem)

    # --- The image name encodes the request; spaces are replaced with underscores
    image_name = (
        f"Inferred_Mortality_{gender}_{country}_{version}_from_{min_year}_to_{max_year}.png"
    ).replace(" ", "_")

    fit_years = (min_year, max_year)
    mortality_read_infer_plot(country=country,
                              version=version,
                              gender=gender,
                              interval_fit=fit_years,
                              save_data=save_data,
                              other_csv_filename=other_csv_filename,
                              img_dir=img_dir,
                              filename_to_save_to=image_name)