AggregateSelector

`AggregateSelector`

Source code in src/logos/aggregate_selector.py

class AggregateSelector:
    """
    Chooses, for each parsed variable, which of its aggregate columns to keep.

    The aggregate column with the highest empirical entropy is kept; all other
    aggregate columns of that variable are reported as uninformative.
    """

    # Aggregates known per variable type. The first entry of each list is used
    # as the initial "best" candidate in `find_uninformative_aggregates`.
    DEFAULT_AGGREGATES = {
        "num": [
            "mean",
            "max",
            "min",
        ],
        "str": [
            "last",
            "mode",
            "first",
        ],
    }

    @staticmethod
    def _entropy(col: pd.Series) -> float:
        """
        Calculates the entropy of a column.

        Parameters:
            col: The column for which to calculate the entropy.

        Returns:
            The entropy (in bits) of `col`, or 0.0 if `col` has no countable
            values.
        """

        # Relative frequencies of the distinct values; `value_counts` never
        # yields zero frequencies, so `log2` is always defined here.
        rel_value_counts = col.value_counts(normalize=True)
        if rel_value_counts.empty:
            return 0.0
        return -np.sum(rel_value_counts * np.log2(rel_value_counts))

    @staticmethod
    def find_uninformative_aggregates(
        prepared_log: pd.DataFrame, parsed_variables: pd.DataFrame, causal_unit_var: str
    ) -> list[str]:
        """
        Find aggregates that are uninformative for each column in `prepared_log`.
        Aggregates are uninformative unless they maximize the empirical entropy across causal units.

        Parameters:
            prepared_log: The prepared log. Must contain one column named
                `"<Name>+<aggregate>"` for every aggregate of every considered variable.
            parsed_variables: The parsed variables. Each row is read through its
                `Name`, `Type` and `Aggregates` fields.
            causal_unit_var: The name of the causal unit variable; its row is skipped.

        Returns:
            A list of uninformative aggregates for `prepared_log`.
        """

        drop_list = []

        for row in parsed_variables.itertuples():
            aggs = row.Aggregates
            if len(aggs) == 0 or row.Name == causal_unit_var:
                continue

            candidates = [f"{row.Name}+{agg}" for agg in aggs]
            # Start from the type's first default aggregate; any candidate with
            # a finite entropy replaces it in the loop below.
            best_var = f"{row.Name}+{AggregateSelector.DEFAULT_AGGREGATES[row.Type][0]}"
            max_entropy = -np.inf

            for candidate in candidates:
                entropy = AggregateSelector._entropy(prepared_log[candidate])

                # Strict comparison: on ties the earliest candidate wins.
                if entropy > max_entropy:
                    best_var = candidate
                    max_entropy = entropy

            drop_list.extend(
                candidate for candidate in candidates if candidate != best_var
            )

        return drop_list

`_entropy(col)`

Calculates the entropy of a column.

Parameters:

Name	Type	Description	Default
`col`	`Series`	The column for which to calculate the entropy.	required

Returns:

Type	Description
`float`	The entropy of `col`.

Source code in src/logos/aggregate_selector.py

def _entropy(col: pd.Series) -> float:
    """
    Calculates the entropy of a column.

    Parameters:
        col: The column for which to calculate the entropy.

    Returns:
        The entropy (in bits) of `col`, or 0.0 if `col` has no countable
        values.
    """

    # Relative frequencies of the distinct values; `value_counts` never yields
    # zero frequencies, so `log2` is always defined here.
    rel_value_counts = col.value_counts(normalize=True)
    if rel_value_counts.empty:
        # Return a float, as the annotation promises, rather than int 0.
        return 0.0
    return -np.sum(rel_value_counts * np.log2(rel_value_counts))

`find_uninformative_aggregates(prepared_log, parsed_variables, causal_unit_var)`

Find aggregates that are uninformative for each column in `prepared_log`. For each variable, every aggregate is considered uninformative except the one that maximizes the empirical entropy across causal units.

Parameters:

Name	Type	Description	Default
`prepared_log`	`DataFrame`	The prepared log.	required
`parsed_variables`	`DataFrame`	The parsed variables.	required
`causal_unit_var`	`str`	The name of the causal unit variable.	required

Returns:

Type	Description
`list[str]`	A list of uninformative aggregates for `prepared_log`.

Source code in src/logos/aggregate_selector.py

def find_uninformative_aggregates(
    prepared_log: pd.DataFrame, parsed_variables: pd.DataFrame, causal_unit_var: str
) -> list[str]:
    """
    Collect the aggregate columns of `prepared_log` that carry the least information.

    For every parsed variable (other than the causal unit variable) that has
    aggregates, the aggregate column with the highest empirical entropy is kept
    and all of its sibling aggregate columns are reported as uninformative.

    Parameters:
        prepared_log: The prepared log, holding one `"<Name>+<aggregate>"` column
            per aggregate of each considered variable.
        parsed_variables: The parsed variables; each row is read through its
            `Name`, `Type` and `Aggregates` fields.
        causal_unit_var: The name of the causal unit variable, whose row is skipped.

    Returns:
        The names of the uninformative aggregate columns of `prepared_log`.
    """

    uninformative = []

    for variable in parsed_variables.itertuples():
        aggregates = variable.Aggregates
        if len(aggregates) == 0 or variable.Name == causal_unit_var:
            continue

        candidates = [f"{variable.Name}+{agg}" for agg in aggregates]

        # Initial pick: the first default aggregate for this variable's type.
        default_agg = AggregateSelector.DEFAULT_AGGREGATES[variable.Type][0]
        winner = f"{variable.Name}+{default_agg}"
        top_entropy = -np.inf

        for candidate in candidates:
            candidate_entropy = AggregateSelector._entropy(prepared_log[candidate])

            # Strictly greater only, so the earliest candidate wins ties.
            if candidate_entropy > top_entropy:
                winner, top_entropy = candidate, candidate_entropy

        uninformative += [name for name in candidates if name != winner]

    return uninformative

`mean(x)`

Calculates the mean of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the mean will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The mean of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def mean(x: pd.Series) -> Optional[pd.Series]:
    """
    Averages the non-NA entries of a series.

    Parameters:
        x: The series to average.

    Returns:
        The mean of the non-NA entries, or None if the series has no non-NA
        entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    return x.mean(skipna=True)

`min(x)`

Calculates the minimum of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the minimum will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The minimum of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def min(x: pd.Series) -> Optional[pd.Series]:
    """
    Finds the smallest non-NA entry of a series.

    Parameters:
        x: The series to search.

    Returns:
        The minimum of the non-NA entries, or None if the series has no
        non-NA entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    return x.min(skipna=True)

`max(x)`

Calculates the maximum of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the maximum will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The maximum of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def max(x: pd.Series) -> Optional[pd.Series]:
    """
    Finds the largest non-NA entry of a series.

    Parameters:
        x: The series to search.

    Returns:
        The maximum of the non-NA entries, or None if the series has no
        non-NA entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    return x.max(skipna=True)

`median(x)`

Calculates the median of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the median will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The median of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def median(x: pd.Series) -> Optional[pd.Series]:
    """
    Computes the median of the non-NA entries of a series.

    Parameters:
        x: The series whose median is wanted.

    Returns:
        The median of the non-NA entries, or None if the series has no
        non-NA entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    return x.median(skipna=True)

`mode(x)`

Calculates the mode of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the mode will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The mode of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def mode(x: pd.Series) -> Optional[pd.Series]:
    """
    Finds the most frequent non-NA entry of a series.

    Parameters:
        x: The series whose mode is wanted.

    Returns:
        The first entry of `x.mode(dropna=True)`, or None if the series has
        no non-NA entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    modes = x.mode(dropna=True)
    # Several values may share the top frequency; only the first one is used.
    return modes[0]

`std(x)`

Calculates the standard deviation of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the standard deviation will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The standard deviation of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def std(x: pd.Series) -> Optional[pd.Series]:
    """
    Computes the standard deviation of the non-NA entries of a series.

    Parameters:
        x: The series whose standard deviation is wanted.

    Returns:
        The result of `x.std(skipna=True)`, or None if the series has no
        non-NA entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    return x.std(skipna=True)

`last(x)`

Returns the last non-NA value in a series.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the last non-NA value will be returned.	required

Returns:

Type	Description
`Optional[Series]`	The last non-NA value of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def last(x: pd.Series) -> Optional[pd.Series]:
    """
    Picks the final non-NA entry of a series.

    Parameters:
        x: The series to take the last non-NA entry from.

    Returns:
        A one-element series holding the last non-NA entry under its original
        index label, or None if the series has no non-NA entries (which
        includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to pick.
    if not x.notna().any():
        return None
    present = x.dropna()
    return present.tail(1)

`first(x)`

Returns the first non-NA value in a series.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the first non-NA value will be returned.	required

Returns:

Type	Description
`Optional[Series]`	The first non-NA value of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def first(x: pd.Series) -> Optional[pd.Series]:
    """
    Picks the leading non-NA entry of a series.

    Parameters:
        x: The series to take the first non-NA entry from.

    Returns:
        A one-element series holding the first non-NA entry under its
        original index label, or None if the series has no non-NA entries
        (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to pick.
    if not x.notna().any():
        return None
    present = x.dropna()
    return present.head(1)

`sum(x)`

Calculates the sum of a series, ignoring NA values.

Parameters:

Name	Type	Description	Default
`x`	`Series`	The series for which the sum will be calculated.	required

Returns:

Type	Description
`Optional[Series]`	The sum of the series, or None if the series is all NA.

Source code in src/logos/aggimp/agg_funcs.py

def sum(x: pd.Series) -> Optional[pd.Series]:
    """
    Adds up the non-NA entries of a series.

    Parameters:
        x: The series to total.

    Returns:
        The sum of the non-NA entries, or None if the series has no non-NA
        entries (which includes an empty series).
    """
    # Guard first: without any usable entry there is nothing to aggregate.
    if not x.notna().any():
        return None
    return x.sum(skipna=True)