infer_natural_mortality

`infer_natural_mortality(year_age_rate_data, interval_fit=None, predict_horizon=2100.0)`

Convert the raw mortality data to what would have been the expected natural mortality had HIV not caused a spike in mortality. This algorithm assumes that during the 'interval_fit' mortality rates had a relatively steady decline (i.e. fewer people were dying each year), but shortly after the 'interval_fit' there was a spike in mortality. The algorithm fits a trend to the data in the 'interval_fit' and extrapolates it forward to the 'predict_horizon'.

It assumes that the input data is for only one node.

Parameters:

Name	Type	Description	Default
`year_age_rate_data`	`YearAgeRate`	This is a YearAgeRate data object containing the raw mortality data, probably output from the extract_mortality() function	required
`interval_fit`	`tuple[float, float]`	The range of years (start, end) used to determine the mortality trend before the HIV epidemic. The fitted trend is extrapolated from the end of this range forward. If `None`, the range (1970, 1980) is used.	`None`
`predict_horizon`	`float`	This determines how far out the data will be extrapolated.	`2100.0`

Returns:

Type	Description
`YearAgeRate`	A YearAgeRate object with the inferred mortality rates.

Source code in emodpy_hiv/demographics/infer_natural_mortality.py

def infer_natural_mortality(year_age_rate_data: YearAgeRate,
                            interval_fit: tuple[float, float] = None,
                            predict_horizon: float = 2100.0) -> YearAgeRate:
    """
    Convert the raw mortality data to what would have been the expected natural mortality had HIV
    not caused a spike in mortality. This algorithm assumes that during the 'interval_fit'
    mortality rates had a relatively steady decline (i.e. fewer people were dying each year),
    but shortly after the 'interval_fit' there was a spike in mortality. For each age, the
    algorithm fits a linear trend to log(rate) over the 'interval_fit' and uses it to predict
    the rate for every year in the data from the start of the 'interval_fit' up to
    'predict_horizon'. The returned rate is the smaller of the raw rate and the prediction;
    years before the 'interval_fit' keep their raw rate.

    It assumes that the input data is for only one node.

    Args:
        year_age_rate_data (YearAgeRate): This is a YearAgeRate data object containing the raw mortality data,
            probably output from the **extract_mortality()** function
        interval_fit (tuple[float, float]): The range of years (start, end) used to determine the
            mortality trend before the HIV epidemic. The fitted trend is extrapolated
            from the start of this range forward. If None, (1970, 1980) is used.
        predict_horizon (float): This determines how far out the data will be extrapolated.
            It must be strictly greater than the end of the 'interval_fit'.

    Returns:
        A YearAgeRate object with the inferred mortality rates.

    Raises:
        ValueError: If the data contains more than one node, if the 'interval_fit' start is not
            strictly less than its end, or if 'predict_horizon' is not strictly greater than
            the end of the 'interval_fit'.
    """

    node_id_list = year_age_rate_data.df[YearAgeRate.COL_NAME_NODE_ID].unique().tolist()
    if len(node_id_list) != 1:
        msg  =  "Invalid number of nodes in the YearAgeRate object.\n"         # noqa: E221, E222
        msg +=  "The algorithm currently only supports one node at a time.\n"  # noqa: E222
        msg += f"The input has the following nodes: {node_id_list}"
        raise ValueError(msg)

    if interval_fit is None:
        interval_fit = (1970, 1980)
    elif interval_fit[1] <= interval_fit[0]:
        msg  =  "Invalid 'interval_fit' values.\n" # noqa: E221, E222
        msg += f"The first value in the tuple, {interval_fit[0]} must be strictly less\n"
        msg += f"than the second value, {interval_fit[1]}."
        raise ValueError(msg)

    if predict_horizon <= interval_fit[1]:
        msg  = "Invalid value of 'predict_horizon' with respect to 'interval_fit'.\n" # noqa: E221
        msg += f"The 'predict_horizon' (={predict_horizon}) must be greater than\n"
        msg += f"the second value of the 'interval_fit[1]' (={interval_fit[1]})."
        # The message used to be built and then dropped; actually reject the input.
        raise ValueError(msg)

    # Work on a copy so the caller's YearAgeRate dataframe is never modified.
    df_mort = year_age_rate_data.df.copy()

    # ---------------------------------------------------
    # --- Log transform the data and sort by year and age
    # ---------------------------------------------------
    df_mort[YearAgeRate.COL_NAME_RATE] = df_mort[YearAgeRate.COL_NAME_RATE].apply(lambda x: np.log(x))
    df_mort.sort_values([YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], inplace=True)

    # -------------------------------------------------------------------
    # --- Extract the data that exists BEFORE the reference interval.
    # --- Do not include any data from the interval
    # --- inclusive=left means the right is not inclusive.
    # -------------------------------------------------------------------
    min_year_col = df_mort[YearAgeRate.COL_NAME_MIN_YEAR]
    df_before_time = df_mort[min_year_col.between(0, interval_fit[0], inclusive='left')].copy()

    # -------------------------------------------------------------------------
    # --- Get list of ages to extrapolate the data for and setup the dataframe
    # -------------------------------------------------------------------------
    age_list = df_mort[YearAgeRate.COL_NAME_MIN_AGE].unique().tolist()
    df_mort.set_index([YearAgeRate.COL_NAME_MIN_AGE], inplace=True)

    df_list = []
    for age in age_list:
        tmp_data = df_mort.loc[age, :]
        extrap_model = make_pipeline(StandardScaler(with_mean=False), LinearRegression())

        # -------------------------------------------------------------------------------------------------
        # --- Get the data (log(rate)) for the reference interval and from the beginning of the reference
        # --- interval all the way to the end of the prediction.
        # --- NOTE(review): the positional lookups below treat column 1 as the year and column 2 as
        # --- log(rate); this depends on the column order of the YearAgeRate dataframe - confirm.
        # -------------------------------------------------------------------------------------------------
        min_year_col = tmp_data[YearAgeRate.COL_NAME_MIN_YEAR]
        first_extrap_df = tmp_data[min_year_col.between(interval_fit[0], interval_fit[1])]
        xx              = tmp_data[min_year_col.between(interval_fit[0], predict_horizon)].values[:, 1] # noqa: E221

        # ----------------------------------------------------------------------------------------
        # --- Have the model fit the reference interval and then use that to predict the interval
        # --- from the beginning of the reference interval to the end.
        # ----------------------------------------------------------------------------------------
        values = first_extrap_df.values
        extrap_model.fit(values[:, 1].reshape(-1, 1), values[:, 2])
        extrap_predictions = extrap_model.predict(xx.reshape(-1, 1))

        # ----------------------------------------------------
        # --- Create a new dataframe with the new predictions
        # ----------------------------------------------------
        extrap_dict = {
            YearAgeRate.COL_NAME_MIN_AGE: age,
            YearAgeRate.COL_NAME_MIN_YEAR: xx,
            'Extrap': extrap_predictions
        }
        loc_df = pd.DataFrame.from_dict(extrap_dict)
        loc_df.set_index([YearAgeRate.COL_NAME_MIN_AGE, YearAgeRate.COL_NAME_MIN_YEAR], inplace=True)

        df_list.append(loc_df.copy())

    # ---------------------------------------------------------------
    # --- Concatenate the dataframes from the different ages together
    # ---------------------------------------------------------------
    df_e1 = pd.concat(df_list, axis=0)

    # -----------------------------------------------------------
    # --- Merge the dataframe of predictions with the original
    # -----------------------------------------------------------
    df_list_final = [df_mort, df_e1]
    df_total = reduce(lambda left, right: pd.merge(left, right, on=[YearAgeRate.COL_NAME_MIN_AGE, YearAgeRate.COL_NAME_MIN_YEAR]), df_list_final)
    df_total = df_total.reset_index(inplace=False).set_index([YearAgeRate.COL_NAME_MIN_AGE], inplace=False)

    # ----------------------------------------------
    # --- Convert the data to rates from log(rates)
    # ----------------------------------------------
    df_total['Extrap'] = df_total['Extrap'].apply(np.exp)
    df_total['Data'] = df_total[YearAgeRate.COL_NAME_RATE].apply(np.exp)

    # ------------------------------------------------------------------
    # --- Convert the before time data to rates from log(rates) and
    # --- combine the data into one dataframe for the entire time period
    # ------------------------------------------------------------------
    df_before_time['Data'] = df_before_time[YearAgeRate.COL_NAME_RATE].apply(np.exp)
    df_before_time.set_index([YearAgeRate.COL_NAME_MIN_AGE], inplace=True)
    df_total = pd.concat([df_total, df_before_time], axis=0, join='outer', sort=True)
    df_total.reset_index(inplace=True)
    df_total.sort_values(by=[YearAgeRate.COL_NAME_MIN_YEAR, YearAgeRate.COL_NAME_MIN_AGE], inplace=True)

    # ----------------------------------------------------------------------
    # --- Replace NaNs: rows from before the interval have no 'Extrap' value,
    # --- so take the smallest of the values that are actually present.
    # ----------------------------------------------------------------------
    def min_not_nan(x_list):
        loc_in = list(filter(lambda x: not np.isnan(x), x_list))
        return np.min(loc_in)
    df_total[YearAgeRate.COL_NAME_RATE] = df_total[['Data', 'Extrap']].apply(min_not_nan, axis=1)

    # ------------------------------------------------------
    # --- Convert dataframe to one suitable for YearAgeRate
    # ------------------------------------------------------
    df_total.drop(columns=['Data', 'Extrap'], inplace=True)
    # Sort into a new frame rather than in place on a column selection.
    df_total = df_total[YearAgeRate.COL_NAMES].sort_values(by=YearAgeRate.SORT_BY_COLUMNS)

    return YearAgeRate(df=df_total)

`mortality_read_infer_plot(country, version, gender, interval_fit=None, save_data=False, other_csv_filename=None, img_dir=None, filename_to_save_to=None)`

Extract the mortality data from the given file for the given country and plot both the raw data and the inferred data (without HIV deaths). The plot window will have a plot for each age where the data in the plot shows the rate per year for that age group. The label shows the minimum age of the band. This means that the age range being represented is from this value to the next largest age that is plotted.

Parameters:

Name	Type	Description	Default
`country`	`str`	The name of the country to be extracted. It must match exactly to the country name used in the referenced spreadsheet.	required
`version`	`str`	The year/version of the indicated spreadsheet. Each year has a slightly different format. Supported versions are 2012, 2015, 2019, 2024	required
`gender`	`str`	The gender of the data to be extracted. Possible values are 'male' and 'female'.	required
`interval_fit`	`tuple[float, float]`	The range of years (start, end) used to determine the mortality trend before the HIV epidemic. The fitted trend is extrapolated from the end of this range forward.	`None`
`save_data`	`bool`	If true (default is False), two YearAgeRate CSV formatted files are output into the current directory. The files are: 'raw_mortality_year_age_rate.csv' and 'mortality_minus_hiv_year_age_rate.csv'.	`False`
`other_csv_filename`	`str`	If the filename is defined, it reads an expected YearAgeRate CSV file and plots it with the raw and inferred data from the spreadsheet. This allows the user to plot other mortality data (e.g. older versions) with the new data.	`None`
`img_dir`	`str`	If this is defined, the images are saved to this directory. If not defined, the images are displayed in a window.	`None`
`filename_to_save_to`	`str`	The name of the file to save the image to. This is only used if img_dir is defined.	`None`

Source code in emodpy_hiv/demographics/infer_natural_mortality.py

def mortality_read_infer_plot(country: str,
                              version: str,
                              gender: str,
                              interval_fit: tuple[float, float] = None,
                              save_data: bool = False,
                              other_csv_filename: str = None,
                              img_dir: str = None,
                              filename_to_save_to: str = None) -> None:
    """
    Extract the mortality data from the given file for the given country and plot
    both the raw data and the inferred data (without HIV deaths).  The plot window will
    have a plot for each age where the data in the plot shows the rate per year for
    that age group.  The label shows the minimum age of the band.  This means that the
    age range being represented is from this value to the next largest age that is plotted.

    Args:
        country (str): The name of the country to be extracted.  It must match exactly to the
            country name used in the referenced spreadsheet.

        version (str): The year/version of the indicated spreadsheet.  Each year has a slightly
            different format.  Supported versions are 2012, 2015, 2019, 2024

        gender (str):
            The gender of the data to be extracted.  Possible values are 'male' and 'female'.

        interval_fit (tuple[float, float]): The range of years (start, end) used to determine the
            mortality trend before the HIV epidemic. The fitted trend is extrapolated
            from the end of this range forward. If None, (1970, 1980) is used.

        save_data (bool): If true (default is False), two YearAgeRate CSV formatted files are
            output into the current directory.  The files are: 'raw_mortality_year_age_rate.csv'
            and 'mortality_minus_hiv_year_age_rate.csv'.

        other_csv_filename (str): If the filename is defined, it reads an expected YearAgeRate CSV
            file and plots it with the raw and inferred data from the spreadsheet.
            This allows the user to plot other mortality data (e.g. older versions)
            with the new data.

        img_dir (str): If this is defined, the images are saved to this directory.  If not defined,
            the images are displayed in a window.

        filename_to_save_to (str): The name of the file to save the image to.  This is only used if
            img_dir is defined.
    """
    # Resolve the default here (same fallback that infer_natural_mortality() applies) so the
    # plot title below can always report the interval that was actually used for the fit.
    # Previously a None interval crashed when the title indexed into it.
    if interval_fit is None:
        interval_fit = (1970, 1980)

    raw_mortality_yar = unwp.extract_mortality(country=country,
                                               version=version,
                                               gender=gender)
    mortality_minus_hiv_yar = infer_natural_mortality(year_age_rate_data=raw_mortality_yar,
                                                      interval_fit=interval_fit,
                                                      predict_horizon=2100)

    if save_data:
        raw_mortality_yar.to_csv("raw_mortality_year_age_rate.csv")
        mortality_minus_hiv_yar.to_csv("mortality_minus_hiv_year_age_rate.csv")

    # Plot order: raw data, inferred data, then the optional comparison data.
    plot_list = [raw_mortality_yar, mortality_minus_hiv_yar]

    if other_csv_filename:
        other_yar = YearAgeRate(csv_filename=other_csv_filename)
        plot_list.append(other_yar)

    title = f"Red: UN World Population, country={country}, version={version}, gender={gender}\n"
    title += f"Blue: Natural mortality inferred from {interval_fit[0]} to {interval_fit[1]}"
    year_age_rate.plot(year_age_rate_list=plot_list,
                       title=title,
                       node_id=0,
                       img_dir=img_dir,
                       filename_to_save_to=filename_to_save_to)

`mortality_read_infer_plot_app(country, version, gender, min_year, max_year, save_data=False, other_csv_filename=None, img_dir=None)`

Using the 'version' and 'gender', select the UN World Population mortality data file. Extract the mortality data from the selected file for the given country and plot both the raw data and the inferred data (without HIV deaths). The plot will have a plot for each age where the data in the plot shows the rate per year for that age group. The label shows the minimum age of the band. This means that the age range being represented is from this value to the next largest age that is plotted.

Parameters:

Name	Type	Description	Default
`country`	`str`	The name of the country to be extracted. It must match exactly to the country name used in the referenced spreadsheet.	required
`version`	`str`	The year/version of the indicated spreadsheet. Each year has a slightly different format. Supported versions are 2012, 2015, 2019, 2024	required
`gender`	`str`	The gender of the data to be extracted. Possible values are 'male' and 'female'.	required
`min_year`	`int`	The start year of the interval to fit the data to.	required
`max_year`	`int`	The end year of the interval to fit the data to.	required
`save_data`	`bool`	If true (default is False), two YearAgeRate CSV formatted files are output into the current directory. The files are: 'raw_mortality_year_age_rate.csv' and 'mortality_minus_hiv_year_age_rate.csv'.	`False`
`other_csv_filename`	`str`	If the filename is defined, it reads an expected YearAgeRate CSV file and plots it with the raw and inferred data from the spreadsheet. This allows the user to plot other mortality data (e.g. older versions) with the new data.	`None`
`img_dir`	`str`	If this is defined, the images are saved to this directory. If not defined, the images are displayed in a window.	`None`

Source code in emodpy_hiv/demographics/infer_natural_mortality.py

def mortality_read_infer_plot_app(country: str,
                                  version: str,
                                  gender: str,
                                  min_year: int,
                                  max_year: int,
                                  save_data: bool = False,
                                  other_csv_filename: str = None,
                                  img_dir: str = None) -> None:
    """
    Validate the requested fit interval, derive the output image name, and delegate to
    mortality_read_infer_plot().

    The 'version' and 'gender' select the UN World Population mortality data file.  The
    mortality data for the given country is extracted from that file and both the raw data
    and the inferred data (without HIV deaths) are plotted.  There is one plot per age,
    showing the rate per year for that age group.  The label shows the minimum age of the
    band, so the age range represented runs from this value to the next largest age plotted.

    Args:
        country: The name of the country to be extracted.  It must match exactly to the
            country name used in the referenced spreadsheet.

        version: The year/version of the indicated spreadsheet.  Each year has a slightly
            different format.  Supported versions are 2012, 2015, 2019, 2024

        gender: The gender of the data to be extracted.  Possible values are 'male' and 'female'.

        min_year: The start year of the interval to fit the data to.  Must be >= 1950.

        max_year: The end year of the interval to fit the data to.  Must be <= 2100 and
            strictly greater than min_year.

        save_data: If true (default is False), two YearAgeRate CSV formatted files are
            output into the current directory.  The files are: 'raw_mortality_year_age_rate.csv'
            and 'mortality_minus_hiv_year_age_rate.csv'.

        other_csv_filename: If the filename is defined, it reads an expected YearAgeRate CSV
            file and plots it with the raw and inferred data from the spreadsheet.
            This allows the user to plot other mortality data (e.g. older versions)
            with the new data.

        img_dir: If this is defined, the images are saved to this directory.  If not defined,
            the images are displayed in a window.

    Raises:
        ValueError: If min_year >= max_year, min_year < 1950, or max_year > 2100.
    """
    # Guard clauses: reject bad intervals before any data is read.
    if max_year <= min_year:
        raise ValueError(f"Invalid interval: {min_year} >= {max_year}.\n"
                         "The min year must be less than the max year.")
    if min_year < 1950:
        raise ValueError(f"Invalid interval: {min_year} < 1950.\n"
                         "The min year must be greater than or equal to 1950,\n"
                         "because the data is not present in the files.")
    if max_year > 2100:
        raise ValueError(f"Invalid interval: {max_year} > 2100.\n"
                         "The max year must be less than or equal to 2100,\n"
                         "because the data is not present in the files.")

    # Country names may contain spaces; swap them for underscores in the image name.
    image_name = (
        f"Inferred_Mortality_{gender}_{country}_{version}_from_{min_year}_to_{max_year}.png"
    ).replace(" ", "_")
    fit_years = (min_year, max_year)

    mortality_read_infer_plot(country=country,
                              version=version,
                              gender=gender,
                              interval_fit=fit_years,
                              save_data=save_data,
                              other_csv_filename=other_csv_filename,
                              img_dir=img_dir,
                              filename_to_save_to=image_name)