Skip to content

AggregateSelector

AggregateSelector

Source code in src/logos/aggregate_selector.py
class AggregateSelector:
    """
    Helpers for deciding which aggregate columns of a prepared log are worth keeping.

    For every parsed variable, only the aggregate column with the highest
    empirical entropy is considered informative; the others are reported so
    that the caller can drop them.
    """

    # Aggregate names per variable type key ("num" / "str").
    # NOTE(review): presumably consumed by other modules as the default set of
    # aggregates to compute -- confirm against callers.
    DEFAULT_AGGREGATES = {
        "num": [
            "mean",
            "max",
            "min",
        ],
        "str": [
            "last",
            "mode",
            "first",
        ],
    }

    @staticmethod
    def _entropy(col: pd.Series) -> float:
        """
        Calculates the entropy (in bits) of a column.

        Parameters:
            col: The column for which to calculate the entropy.

        Returns:
            The entropy of `col`, or 0.0 if `col` has no countable values.
        """

        probabilities = col.value_counts(normalize=True)
        # Keep strictly positive frequencies only: zero-count entries (e.g. unused
        # categories) contribute nothing and would otherwise evaluate log2(0).
        probabilities = probabilities[probabilities > 0]
        if probabilities.empty:
            return 0.0
        return -np.sum(probabilities * np.log2(probabilities))

    @staticmethod
    def find_uninformative_aggregates(
        prepared_log: pd.DataFrame, parsed_variables: pd.DataFrame, causal_unit_var: str
    ) -> list[str]:
        """
        Find aggregates that are uninformative for each column in `prepared_log`.
        Aggregates are uninformative unless they maximize the empirical entropy across causal units.

        Parameters:
            prepared_log: The prepared log; must contain one column named
                `"<Name>+<aggregate>"` for every aggregate of every considered variable.
            parsed_variables: The parsed variables; the `Name` and `Aggregates`
                columns are read for each row.
            causal_unit_var: The name of the causal unit variable, which is skipped.

        Returns:
            A list of uninformative aggregates for `prepared_log`.
        """

        drop_list = []

        for row in parsed_variables.itertuples():
            aggs = row.Aggregates
            if len(aggs) == 0 or row.Name == causal_unit_var:
                continue

            candidates = [f"{row.Name}+{agg}" for agg in aggs]

            # The first candidate always beats -inf, so no type-specific default is
            # needed; the strict comparison keeps the earliest candidate on ties.
            best_var = None
            max_entropy = -np.inf
            for var in candidates:
                entropy = AggregateSelector._entropy(prepared_log[var])
                if entropy > max_entropy:
                    best_var, max_entropy = var, entropy

            drop_list.extend(var for var in candidates if var != best_var)

        return drop_list

_entropy(col)

Calculates the entropy of a column.

Parameters:

Name Type Description Default
col Series

The column for which to calculate the entropy.

required

Returns:

Type Description
float

The entropy of col.

Source code in src/logos/aggregate_selector.py
def _entropy(col: pd.Series) -> float:
    """
    Calculates the entropy of a column.

    Parameters:
        col: The column for which to calculate the entropy.

    Returns:
        The entropy of `col`.
    """

    rel_value_counts = col.value_counts(normalize=True)
    if rel_value_counts.empty:
        return 0
    return -np.sum(rel_value_counts * np.log2(rel_value_counts))

find_uninformative_aggregates(prepared_log, parsed_variables, causal_unit_var)

Find aggregates that are uninformative for each column in prepared_log. Aggregates are uninformative unless they maximize the empirical entropy across causal units.

Parameters:

Name Type Description Default
prepared_log DataFrame

The prepared log.

required
parsed_variables DataFrame

The parsed variables.

required
causal_unit_var str

The name of the causal unit variable.

required

Returns:

Type Description
list[str]

A list of uninformative aggregates for prepared_log.

Source code in src/logos/aggregate_selector.py
def find_uninformative_aggregates(
    prepared_log: pd.DataFrame, parsed_variables: pd.DataFrame, causal_unit_var: str
) -> list[str]:
    """
    List the aggregate columns of `prepared_log` that are not worth keeping.
    For each variable, only the aggregate with the highest empirical entropy
    is kept; every other aggregate of that variable is reported.

    Parameters:
        prepared_log: The prepared log.
        parsed_variables: The parsed variables.
        causal_unit_var: The name of the causal unit variable.

    Returns:
        A list of uninformative aggregates for `prepared_log`.
    """

    dropped = []

    for variable in parsed_variables.itertuples():
        aggregates = variable.Aggregates

        # Variables without aggregates and the causal unit itself are skipped.
        if len(aggregates) == 0:
            continue
        if variable.Name == causal_unit_var:
            continue

        candidates = [f"{variable.Name}+{aggregate}" for aggregate in aggregates]

        # Start from the first default aggregate of the variable's type.
        default_aggregate = AggregateSelector.DEFAULT_AGGREGATES[variable.Type][0]
        winner = f"{variable.Name}+{default_aggregate}"
        highest = -np.inf

        for candidate in candidates:
            score = AggregateSelector._entropy(prepared_log[candidate])
            # Strict comparison: the earliest candidate wins ties.
            if score > highest:
                winner, highest = candidate, score

        dropped += [candidate for candidate in candidates if candidate != winner]

    return dropped

mean(x)

Calculates the mean of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the mean will be calculated.

required

Returns:

Type Description
Optional[Series]

The mean of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def mean(x: pd.Series) -> Optional[pd.Series]:
    """
    Compute the mean of the non-NA entries of a series.

    Parameters:
        x: The series to average.

    Returns:
        The mean of the non-NA values, or None when `x` contains no non-NA
        value (an empty series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    return x.mean(skipna=True)

min(x)

Calculates the minimum of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the minimum will be calculated.

required

Returns:

Type Description
Optional[Series]

The minimum of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def min(x: pd.Series) -> Optional[pd.Series]:
    """
    Find the smallest non-NA entry of a series.

    Parameters:
        x: The series to scan.

    Returns:
        The minimum of the non-NA values, or None when `x` contains no
        non-NA value (an empty series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    return x.min(skipna=True)

max(x)

Calculates the maximum of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the maximum will be calculated.

required

Returns:

Type Description
Optional[Series]

The maximum of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def max(x: pd.Series) -> Optional[pd.Series]:
    """
    Find the largest non-NA entry of a series.

    Parameters:
        x: The series to scan.

    Returns:
        The maximum of the non-NA values, or None when `x` contains no
        non-NA value (an empty series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    return x.max(skipna=True)

median(x)

Calculates the median of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the median will be calculated.

required

Returns:

Type Description
Optional[Series]

The median of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def median(x: pd.Series) -> Optional[pd.Series]:
    """
    Compute the median of the non-NA entries of a series.

    Parameters:
        x: The series whose median is wanted.

    Returns:
        The median of the non-NA values, or None when `x` contains no
        non-NA value (an empty series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    return x.median(skipna=True)

mode(x)

Calculates the mode of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the mode will be calculated.

required

Returns:

Type Description
Optional[Series]

The mode of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def mode(x: pd.Series) -> Optional[pd.Series]:
    """
    Find the most frequent non-NA entry of a series.

    Parameters:
        x: The series whose mode is wanted.

    Returns:
        The first entry of the modes reported by `Series.mode` for the
        non-NA values, or None when `x` contains no non-NA value (an empty
        series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    modes = x.mode(dropna=True)
    return modes[0]

std(x)

Calculates the standard deviation of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the standard deviation will be calculated.

required

Returns:

Type Description
Optional[Series]

The standard deviation of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def std(x: pd.Series) -> Optional[pd.Series]:
    """
    Compute the standard deviation of the non-NA entries of a series.

    Parameters:
        x: The series whose standard deviation is wanted.

    Returns:
        The result of `Series.std` over the non-NA values, or None when `x`
        contains no non-NA value (an empty series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    return x.std(skipna=True)

last(x)

Returns the last non-NA value in a series.

Parameters:

Name Type Description Default
x Series

The series for which the last non-NA value will be returned.

required

Returns:

Type Description
Optional[Series]

The last non-NA value of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def last(x: pd.Series) -> Optional[pd.Series]:
    """
    Pick the last non-NA entry of a series.

    Parameters:
        x: The series to take the last non-NA entry from.

    Returns:
        A one-element series holding the last non-NA value of `x` (with its
        original index label), or None when `x` contains no non-NA value.
    """
    present = x.dropna()
    if present.empty:
        return None
    return present.tail(1)

first(x)

Returns the first non-NA value in a series.

Parameters:

Name Type Description Default
x Series

The series for which the first non-NA value will be returned.

required

Returns:

Type Description
Optional[Series]

The first non-NA value of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def first(x: pd.Series) -> Optional[pd.Series]:
    """
    Pick the first non-NA entry of a series.

    Parameters:
        x: The series to take the first non-NA entry from.

    Returns:
        A one-element series holding the first non-NA value of `x` (with its
        original index label), or None when `x` contains no non-NA value.
    """
    present = x.dropna()
    if present.empty:
        return None
    return present.head(1)

sum(x)

Calculates the sum of a series, ignoring NA values.

Parameters:

Name Type Description Default
x Series

The series for which the sum will be calculated.

required

Returns:

Type Description
Optional[Series]

The sum of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py
def sum(x: pd.Series) -> Optional[pd.Series]:
    """
    Add up the non-NA entries of a series.

    Parameters:
        x: The series to total.

    Returns:
        The sum of the non-NA values, or None when `x` contains no non-NA
        value (an empty series included).
    """
    has_values = x.notna().any()
    if not has_values:
        return None
    return x.sum(skipna=True)