Skip to content

plot_relationship_end

extract_data_for_relationship(filename, relationship_type)

Extract the relationship duration information for the given relationship type in the given file. Please note that only relationships that "broke-up" are considered because those are the relationships that went to the completion of the drawn duration. The relationship could have ended prematurely due to things like death or a partner migrating away.

Parameters:

Name Type Description Default
filename (str, required)

The path and name of the RelationshipEnd.csv to be read.

required
relationship_type (int, required)

The type of relationship. Options: 0 (transitory), 1 (informal), 2 (marital), 3 (commercial).

required

Returns:

Type Description
DataFrame

DataFrame containing only the rows of the given relationship type, with an extra column holding the actual relationship duration.

Source code in emodpy_hiv/plotting/plot_relationship_end.py
def extract_data_for_relationship(filename: str,
                                  relationship_type: int) -> pd.DataFrame:
    """
    Extract the relationship duration information for the given relationship type in the given file.
    Please note that only relationships that "broke-up" are considered because those are the relationships
    that went to the completion of the drawn duration.  The relationship could have ended prematurely due
    to things like death or a partner migrating away.

    Args:
        filename (str, required):
            The path and name of the RelationshipEnd.csv to be read.

        relationship_type (int, required):
            The type of relationship. Options: 0 (transitory), 1 (informal), 2 (marital), 3 (commercial).

    Returns:
        (pd.DataFrame): A new dataframe containing only the "broke-up" rows of the given relationship type,
            with an extra column (TMP_COL_NAME_DURATION) holding the actual relationship duration.

    Raises:
        ValueError: If the relationship type column is missing from the file, or if the requested
            relationship type does not appear anywhere in that column.
    """
    df = pd.read_csv(filename)

    if COL_NAME_REL_TYPE not in df.columns:
        raise ValueError(f"'{COL_NAME_REL_TYPE}' column does not exist in the file({filename}).")

    if relationship_type not in df[COL_NAME_REL_TYPE].unique():
        raise ValueError(f"'{relationship_type}' is not a valid relationship type in the file({filename}).")

    # Apply both filters with a single mask and take an explicit copy.  Adding a column
    # to a filtered slice of the original frame is what triggers pandas'
    # SettingWithCopyWarning; the copy makes the result an independent frame.
    is_requested_type = df[COL_NAME_REL_TYPE] == relationship_type
    is_broken_up = df[COL_NAME_TERMINATION] == TR_BROKEUP
    df = df[is_requested_type & is_broken_up].copy()

    # Actual duration = actual end time minus start time of the relationship.
    df[TMP_COL_NAME_DURATION] = df[COL_NAME_END_TYPE_ACT] - df[COL_NAME_START_TIME]

    return df

plot_relationship_duration_histogram(dir_or_filename, relationship_type, bin_size, expected=None, exp_avg=None, heterogeneity=None, scale=None, show_avg_per_run=False, img_dir=None)

Plot the relationship duration histogram for the given relationship type and show information in the title about the expected Weibull distribution. Please note that only relationships that "broke-up" are considered because those are the relationships that went to the completion of the drawn duration. The relationship could have ended prematurely due to things like death or a partner migrating away.

Parameters:

Name Type Description Default
dir_or_filename (str, required)

The directory or filename containing the RelationshipEnd.csv files.

required
relationship_type (int, required)

The type of relationship. Options: 0 (transitory), 1 (informal), 2 (marital), 3 (commercial).

required
bin_size (float, required)

The size of the bins for the histogram.

required
expected list

Expected values for the Weibull distribution. There must be 16 values.

None
exp_avg float

Expected average duration in days. Will be shown in the title.

None
heterogeneity float

Heterogeneity parameter for the Weibull distribution. Will be shown in the title.

None
scale float

Scale parameter for the Weibull distribution. Will be shown in the title.

None
show_avg_per_run bool

Whether to show the average duration per run. Will be shown in the title.

False
img_dir str

Directory to save the images. If None, the images will not be saved and a window will be opened.

None

Returns:

Source code in emodpy_hiv/plotting/plot_relationship_end.py
def plot_relationship_duration_histogram(dir_or_filename: str,
                                         relationship_type: int,
                                         bin_size: float,
                                         expected: list[float] = None,
                                         exp_avg: float = None,
                                         heterogeneity: float = None,
                                         scale: float = None,
                                         show_avg_per_run: bool = False,
                                         img_dir: str = None):
    """
    Plot the relationship duration histogram for the given relationship type and
    show information in the title about the expected Weibull distribution.
    Please note that only relationships that "broke-up" are considered because those are the relationships
    that went to the completion of the drawn duration.  The relationship could have ended prematurely due
    to things like death or a partner migrating away.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the RelationshipEnd.csv files.

        relationship_type (int, required):
            The type of relationship. Options: 0 (transitory), 1 (informal), 2 (marital), 3 (commercial).

        bin_size (float, required):
            The size of the bins for the histogram.

        expected (list, optional):
            Expected values for the Weibull distribution.  There must be 16 values.

        exp_avg (float, optional):
            Expected average duration in days.  Will be shown in the title ("N/A" when not given).

        heterogeneity (float, optional):
            Heterogeneity parameter for the Weibull distribution.  Will be shown in the title
            ("N/A" when not given).

        scale (float, optional):
            Scale parameter for the Weibull distribution.  Will be shown in the title
            ("N/A" when not given).

        show_avg_per_run (bool, optional):
            Whether to show the average duration per run.  Will be shown in the title.

        img_dir (str, optional):
            Directory to save the images. If None, the images will not be saved and a window will be opened.

    Returns:
        None

    Raises:
        ValueError: If 'expected' is given but does not have 16 values, or if no input files were found.
    """

    def format_optional(value):
        # The Weibull parameters are optional and the actual average is undefined when
        # there is no data.  None cannot take the "0.2f" format spec (TypeError), so
        # show a placeholder in the title instead.
        return "N/A" if value is None else f"{value:0.2f}"

    # -------------------------------------------------------
    # Create the array of bins given the bin size
    # The bins are bin_size, 2*bin_size, ..., 16*bin_size,
    # They are the maximum value of the bin.
    # I selected 16 bins because it seemed like you saw the distribution well.
    # -------------------------------------------------------
    num_bins = 16
    bins = []
    upper_edge = bin_size
    for _ in range(num_bins):
        bins.append(upper_edge)
        upper_edge = upper_edge + bin_size

    if (expected is not None) and (len(expected) != num_bins):
        raise ValueError("The 'expected' Weibull distribution histogram is expected to have 16 values.")

    # ------------------------------
    # Create the labels for the bins
    # Each label is "<lower edge>-<upper edge>" where the first lower edge is 0.
    # ------------------------------
    lower_edges = [0] + bins[:-1]
    bin_label_list = [f"{low}-{high}" for low, high in zip(lower_edges, bins)]

    # -----------------------------------------
    # Get the list of files in the directory
    # If a single file is given, use that file
    # -----------------------------------------
    dir_filenames = helpers.get_filenames(dir_or_filename=dir_or_filename,
                                          file_prefix="RelationshipEnd",
                                          file_extension=".csv")
    if len(dir_filenames) == 0:
        # Without any file there is nothing to plot; fail with a clear message
        # instead of a ZeroDivisionError further down.
        raise ValueError(f"No 'RelationshipEnd' CSV files were found for '{dir_or_filename}'.")

    # --------------------------------------------------------------------------------------
    # Extract the duration data out of each file and determine the histogram
    # of relationship duration.  The histogram is the fraction of relationships in each bin.
    # --------------------------------------------------------------------------------------
    last_bin_index = num_bins - 1
    total = 0
    total_count = 0
    histogram_list = []
    for fn in dir_filenames:
        df = extract_data_for_relationship(filename=fn,
                                           relationship_type=relationship_type)
        count_histogram = [0] * num_bins

        for item in df[TMP_COL_NAME_DURATION]:
            total = total + item
            # A duration goes into the first bin whose upper edge is greater than it.
            # Anything at or beyond the last edge is collected in the last bin.
            bin_index = last_bin_index
            for candidate_index, bin_upper_edge in enumerate(bins):
                if item < bin_upper_edge:
                    bin_index = candidate_index
                    break
            count_histogram[bin_index] = count_histogram[bin_index] + 1

        count_sum = sum(count_histogram)
        if count_sum > 0:
            histogram = [count / count_sum for count in count_histogram]
        else:
            # No "broke-up" relationships of this type in the file: every fraction
            # is zero rather than dividing by zero.
            histogram = [0.0] * num_bins
        histogram_list.append(histogram)
        total_count = total_count + count_sum

    # Calculate average for all relationships in all files.
    # It is undefined (None -> "N/A" in the title) when no relationship was found.
    act_avg = (total / total_count) if total_count > 0 else None

    # ------------------------------------------------------------------------------------
    # Create the dataframe to plot and make the index the bin labels and put the histogram
    # into the data frame.  If not showing the average, then there should be one column for
    # each file.  If showing the average, we want one column with the average of each bin.
    # ------------------------------------------------------------------------------------
    df_hist = pd.DataFrame()
    df_hist["Label"] = bin_label_list
    df_hist.index = df_hist["Label"]
    del df_hist["Label"]

    for hist_index, histogram in enumerate(histogram_list):
        df_hist["Duration-" + str(hist_index)] = histogram

    if show_avg_per_run:
        # Collapse the per-file columns into a single column holding their mean.
        column_names = df_hist.columns
        df_hist[TMP_COL_NAME_AVG_DUR] = 0
        for column_name in column_names:
            df_hist[TMP_COL_NAME_AVG_DUR] = df_hist[TMP_COL_NAME_AVG_DUR] + df_hist[column_name]
            del df_hist[column_name]
        df_hist[TMP_COL_NAME_AVG_DUR] = df_hist[TMP_COL_NAME_AVG_DUR] / len(dir_filenames)

    # -------------------------------------------------------------------------
    # Create the expected dataframe - Show Weibull distribution that the model
    # should have duplicated.
    # -------------------------------------------------------------------------
    expected_df = None
    if expected is not None:
        expected_df = pd.DataFrame()
        expected_df.index = df_hist.index
        expected_df["Expected Duration"] = expected

    # ------------------------------
    # Create the title for the plot
    # ------------------------------
    rel_str = "TRANSITORY"
    if relationship_type == 1:
        rel_str = "INFORMAL"
    elif relationship_type == 2:
        rel_str = "MARITAL"
    elif relationship_type == 3:
        rel_str = "COMMERCIAL"

    title = ""
    if show_avg_per_run:
        title = title + "Average Duration per Run - "
    title = title + f"Relationship Duration Histogram - {rel_str}"

    title2 = (f"Weibull Distribution - Scale={format_optional(scale)}"
              f" - Hetero={format_optional(heterogeneity)}"
              f" - Exp Avg={format_optional(exp_avg)}"
              f" - Act Avg={format_optional(act_avg)}")

    # ---------------
    # Create the plot
    # ---------------
    xy_plot.xy_plot(img_dir=img_dir,
                    df=df_hist,
                    expected_df=expected_df,
                    title_1=title,
                    title_2=title2,
                    x_axis_name="Duration (days)",
                    y_axis_name="Fraction of Relationships",
                    show_legend=show_avg_per_run,
                    show_markers=show_avg_per_run,
                    fraction_of_total=False,
                    min_x=None, max_x=None, min_y=None, max_y=None,
                    x_axis_as_log_scale=False,
                    y_axis_as_log_scale=False)

plot_relationship_duration_histogram_with_expected(dir_or_filename, relationship_type='transitory', show_avg_per_run=False, show_expected=False, img_dir=None)

Plot the relationship duration histogram for the given relationship type. Please note that only relationships that "broke-up" are considered because those are the relationships that went to the completion of the drawn duration. The relationship could have ended prematurely due to things like death or a partner migrating away.

Parameters:

Name Type Description Default
dir_or_filename (str, required)

The directory or filename containing the RelationshipEnd.csv files.

required
relationship_type str

The type of relationship. Options: transitory, informal, marital, commercial. Default is "transitory".

'transitory'
show_avg_per_run bool

Whether to show the average duration per run. Default is False.

False
show_expected bool

Whether to show the expected Weibull distribution. Default is False.

False
img_dir str

Directory to save the images. If None, the images will not be saved and a window will be opened. Default is None, i.e. do not save the image and open a window instead.

None

Returns:

Source code in emodpy_hiv/plotting/plot_relationship_end.py
def plot_relationship_duration_histogram_with_expected(dir_or_filename: str,
                                                       relationship_type: str = "transitory",
                                                       show_avg_per_run: bool = False,
                                                       show_expected: bool = False,
                                                       img_dir: str = None):
    """
    Plot the relationship duration histogram for a relationship type given by name,
    using the built-in bin size and Weibull parameters for that type.
    Only relationships that "broke-up" are considered because those are the ones that ran
    for the full drawn duration; others could have ended early due to things like death
    or a partner migrating away.

    Args:
        dir_or_filename (str, required):
            The directory or filename containing the RelationshipEnd.csv files.

        relationship_type (str, optional):
            The type of relationship. Options: transitory, informal, marital, commercial.
            Default is "transitory".

        show_avg_per_run (bool, optional):
            Whether to show the average duration per run.
            Default is False.

        show_expected (bool, optional):
            Whether to show the expected Weibull distribution.
            Default is False.

        img_dir (str, optional):
            Directory to save the images.  If None, the images will not be saved and a window will be opened.
            Default is None.

    Returns:
        None

    Raises:
        ValueError: If relationship_type is not one of the supported names.
    """
    # ------------------------------------------------------------------------------------------------
    # One row per relationship type:
    #   (name, relationship type id, bin size, heterogeneity, scale, expected average, expected histogram)
    # The expected histogram holds the values of the Weibull distribution with the given scale and
    # heterogeneity.  One can generate these expected values using the C++ code in PrngTest.cpp
    # ------------------------------------------------------------------------------------------------
    weibull_table = (
        ("transitory", 0, 200, 0.833333333, 0.956774771214, 328,
         [0.401084, 0.291050, 0.160738, 0.080045,
          0.037918, 0.016957, 0.007118, 0.003026,
          0.001270, 0.000482, 0.000210, 0.000071,
          0.000020, 0.000006, 0.000000, 0.000000]),
        ("informal", 1, 200, 0.75, 2.03104913138, 681,
         [0.159120, 0.196236, 0.174445, 0.140089,
          0.105469, 0.075556, 0.052367, 0.035005,
          0.023288, 0.015160, 0.009357, 0.005755,
          0.003228, 0.002049, 0.001199, 0.000774]),
        ("marital", 2, 1500, 0.666666667, 22.154455184937, 7299,
         [0.076216, 0.125540, 0.137884, 0.132489,
          0.119088, 0.100110, 0.081648, 0.063388,
          0.048465, 0.035221, 0.025370, 0.018314,
          0.012766, 0.008317, 0.005764, 0.003480]),
        ("commercial", 3, 3, 1.0, 0.01917808219, 7.0,
         [0.348380, 0.227677, 0.147723, 0.096667,
          0.062807, 0.040239, 0.026340, 0.017745,
          0.011341, 0.007405, 0.004911, 0.002995,
          0.001882, 0.001295, 0.000901, 0.000646]),
    )

    # Scan with "==" (rather than a dict lookup) so any input type is compared
    # the same way as a chain of equality tests would.
    selected = None
    for row in weibull_table:
        if relationship_type == row[0]:
            selected = row
            break

    if selected is None:
        raise ValueError(f"Unknown relationship type = {relationship_type}.")

    _, rel_type, bin_size, hetero, scale, exp_avg, weibull_values = selected

    # The expected curve is only passed along when the caller asked for it.
    expected = weibull_values if show_expected else None

    plot_relationship_duration_histogram(dir_or_filename=dir_or_filename,
                                         relationship_type=rel_type,
                                         bin_size=bin_size,
                                         expected=expected,
                                         exp_avg=exp_avg,
                                         heterogeneity=hetero,
                                         scale=scale,
                                         show_avg_per_run=show_avg_per_run,
                                         img_dir=img_dir)