Skip to content

TagUtils

TagOrigin

Bases: IntEnum

Source code in src/logos/tag_utils.py
class TagOrigin(IntEnum):
    """
    Enumeration of the ways in which the tag of a variable can have been obtained.
    """

    PRECEDING: int = 0
    """The tag was taken from the tokens that precede the variable in the corresponding template."""

    GPT_3POINT5_TURBO: int = 1
    """The tag was produced by the gpt-3.5-turbo model."""

    GPT_4: int = 2
    """The tag was produced by the gpt-4 model."""

    NAME: int = 3
    """The tag is simply the name of the variable."""

    REGEX_VARIABLE: int = 4
    """The tag is the name of the variable, because that name was supplied by the user."""

PRECEDING: int = 0 class-attribute instance-attribute

Indicates that the tag was derived from the preceding tokens in the corresponding template.

GPT_3POINT5_TURBO: int = 1 class-attribute instance-attribute

Indicates that the tag was derived using gpt-3.5-turbo.

GPT_4: int = 2 class-attribute instance-attribute

Indicates that the tag was derived using gpt-4.

NAME: int = 3 class-attribute instance-attribute

Indicates that the tag was derived from the name of the variable.

REGEX_VARIABLE: int = 4 class-attribute instance-attribute

Indicates that the tag was derived from the name of the variable because the name was given by the user.

TagUtils

A class for managing tags of parsed and prepared variables.

Source code in src/logos/tag_utils.py
class TagUtils:
    """
    A class for managing tags of parsed and prepared variables.

    Every method is a static method, so the class is used purely as a namespace
    and never needs to be instantiated.
    """

    @staticmethod
    def check_columns(df: pd.DataFrame, columns: list) -> None:
        """
        Check that the specified columns exist in the dataframe.

        Parameters:
            df: The dataframe to be checked.
            columns: The columns to be checked.

        Raises:
            ValueError: If any of the columns are not present in the dataframe.
        """
        missing = set(columns) - set(df.columns)
        if missing:
            raise ValueError(f"Columns {columns} are not all present in the dataframe.")

    @staticmethod
    def check_fields(series: pd.Series, fields: list) -> None:
        """
        Check that the specified fields exist in the specified series.

        Parameters:
            series: The series to be checked.
            fields: The fields to be checked.

        Raises:
            ValueError: If any of the fields are not present in the series.
        """
        missing = set(fields) - set(series.index)
        if missing:
            raise ValueError(f"Fields {fields} are not all present in the series.")

    @staticmethod
    def best_effort_tag(
        templates_df: pd.DataFrame,
        variable_row: pd.Series,
        enable_gpt_tagging: bool,
        gpt_model: str,
    ) -> tuple[str, bool]:
        """
        Apply `gpt_tag` to `variable_row`, if possible, and return the result. If
        `enable_gpt_tagging` is False, or if `gpt_tag` raises an exception (for instance
        because the OpenAI client cannot be created or the request fails), apply
        `preceding_tokens_tag` instead.

        Parameters:
            templates_df: The dataframe containing information about the log templates.
            variable_row: The row of the dataframe containing information about the parsed variable.
            enable_gpt_tagging: A boolean indicating whether GPT tagging should be attempted.
            gpt_model: The GPT model to use.

        Returns:
            A tuple containing (i) the GPT tag for the parsed variable name, if possible, or the
            tag derived from the preceding tokens otherwise, and (ii) a boolean indicating
            whether the GPT tag was used.
        """
        if enable_gpt_tagging:
            try:
                return (TagUtils.gpt_tag(templates_df, variable_row, gpt_model), True)
            except Exception:
                # Deliberate best effort: any GPT failure falls through to the cheap
                # heuristic below. KeyboardInterrupt/SystemExit are left to propagate.
                pass

        # `preceding_tokens_tag` returns (tag, origin); only the tag belongs in the result.
        tag, _ = TagUtils.preceding_tokens_tag(variable_row)
        return (tag, False)

    @staticmethod
    def waterfall_tag(
        templates_df: pd.DataFrame,
        variable_row: pd.Series,
        banned_values: Optional[list[str]] = None,
    ) -> tuple[str, TagOrigin]:
        """
        Apply each of the tagging methods in turn, in order of increasing cost, until a tag is found
        that is not included in the banned values. In particular, apply `preceding_tokens_tag` first,
        then `gpt_tag` with the GPT-3.5 model, and finally `gpt_tag` with the GPT-4 model. If none of
        these methods succeeds, return the name of the variable as the tag.

        A method counts as unsuccessful when it raises or when it returns the variable name itself.
        Exceptions raised by `gpt_tag` are printed and otherwise ignored.

        Parameters:
            templates_df: The dataframe containing information about the log templates.
            variable_row: The row of the dataframe containing information about the parsed variable.
            banned_values: A list of values that should not be used as tags.

        Returns:
            A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.
        """
        name = variable_row["Name"]
        if variable_row["From regex"]:
            return (name, TagOrigin.REGEX_VARIABLE)

        # Cheapest option first: the tokens preceding the variable in its template.
        tag, origin = TagUtils.preceding_tokens_tag(variable_row, banned_values)
        if tag != name:
            return (tag, origin)

        # Then the GPT models, cheapest first: (model id, resulting origin, label used in messages).
        gpt_attempts = (
            ("gpt-3.5-turbo", TagOrigin.GPT_3POINT5_TURBO, "GPT-3.5"),
            ("gpt-4", TagOrigin.GPT_4, "GPT-4"),
        )
        for model, model_origin, label in gpt_attempts:
            try:
                tag = TagUtils.gpt_tag(templates_df, variable_row, model, banned_values)
            except Exception as e:
                print(f"Exception {e} came up while tagging {name} with {label}.")
                continue
            if tag != name:
                return (tag, model_origin)

        return (name, TagOrigin.NAME)

    @staticmethod
    def preceding_tokens_tag(
        variable_row: pd.Series, banned_values: Optional[list[str]] = None
    ) -> tuple[str, TagOrigin]:
        """
        Try to derive a tag for a parsed variable name based on the preceding tokens in the corresponding template.

        Two token patterns are recognised, where `sep` is a token contained in ":=":

        * `... key sep` directly before the variable, and
        * exactly `key sep quote` before the variable, where `quote` is a quote character.

        In both cases `key` becomes the tag unless it starts with "<".
        If no pattern matches, or the candidate is banned, the variable name is returned.

        Parameters:
            variable_row: The row of the dataframe containing information about the parsed variable.
            banned_values: A list of values that should not be used as tags.

        Returns:
            A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.

        Raises:
            ValueError: If `variable_row` lacks one of the required fields.
        """

        TagUtils.check_fields(variable_row, ["Preceding 3 tokens", "Name", "From regex"])
        name = variable_row["Name"]
        if variable_row["From regex"]:
            return name, TagOrigin.REGEX_VARIABLE

        preceding = variable_row["Preceding 3 tokens"]
        separators = ":="
        quotes = "\"'"

        candidate = None
        if (
            len(preceding) >= 2
            and preceding[-1] in separators
            and preceding[-2][0] != "<"
        ):
            candidate = preceding[-2]
        elif (
            len(preceding) == 3
            and preceding[2] in quotes
            and preceding[1] in separators
            and preceding[0][0] != "<"
        ):
            candidate = preceding[0]

        if candidate is None:
            return name, TagOrigin.NAME

        # Double-check that the tag is not in the banned values
        if banned_values is not None and candidate in banned_values:
            return name, TagOrigin.NAME

        return candidate, TagOrigin.PRECEDING

    @staticmethod
    def gpt_tag(
        templates_df: pd.DataFrame,
        variable_row: pd.Series,
        model: str = "gpt-3.5-turbo",
        banned_values: Optional[list[str]] = None,
    ) -> str:
        """
        Use GPT to derive a tag for the variable described in `variable_row`,
        using information about the corresponding log template, retrieved from `templates_df`.

        Every call appends the request and the resulting tag to `gpt_log.txt` in the
        current working directory.

        Parameters:
            templates_df: The dataframe containing information about the log templates.
            variable_row: The row of the dataframe containing information about the parsed variable.
            model: The GPT model to use.
            banned_values: A list of values that should not be used as tags.

        Returns:
            The GPT-generated tag for the parsed variable name, or the variable name itself
            if the generated tag is one of the banned values.

        Raises:
            ValueError: If required fields/columns are missing, if no template matches the
                variable, or if no single-word tag can be extracted from the model's reply.
        """

        TagUtils.check_fields(variable_row, ["Name", "Examples"])
        TagUtils.check_columns(templates_df, ["TemplateId", "TemplateExample"])

        name = variable_row["Name"]
        template_id = ParsedVariableName(name).template_id()
        idx = ParsedVariableName(name).index()

        examples = templates_df.loc[
            templates_df["TemplateId"] == template_id, "TemplateExample"
        ]
        if examples.empty:
            raise ValueError(
                f"No template with id {template_id} was found for variable {name}."
            )
        line = examples.values[0]
        line_toks = line.split()

        # Define the messages to send to the model
        messages = [
            {
                "role": "system",
                "content": "You are a backend engineer that knows all about the logging infrastructure of a distributed system.",
            },
            {
                "role": "user",
                "content": f"""Generate a tag for the variable that takes the value {line_toks[idx]} """
                f"""in the following log line:\n {line}\n"""
                f"""Here are the 3 tokens that precede the variable: [{', '.join(line_toks[max(idx-3, 0):idx])} ]\n"""
                f"""Here are some more example values for this variable: [{', '.join(variable_row['Examples'])} ]\n"""
                # The banned values are not part of the prompt; they are enforced on the reply below.
                """Return only the tag as a single word, possibly including underscores. DO NOT EVER REPLY WITH MORE THAN ONE WORD.\n""",
            },
        ]

        client = OpenAI()
        response = client.chat.completions.create(model=model, messages=messages)
        content = response.choices[0].message.content
        if content is None:
            raise ValueError(f"Model {model} returned no content for variable {name}.")

        # Strip surrounding whitespace so that e.g. a trailing newline does not end up in the tag.
        tag = content.strip()
        if len(tag.split()) > 1:
            # GPT didn't listen to us and returned a phrase describing the tag.
            # Extract the word between the second-last and last occurrence of double quotes.
            pieces = tag.split('"')
            if len(pieces) < 3:
                raise ValueError(
                    f"Could not extract a single-word tag for variable {name} from reply: {tag!r}"
                )
            tag = pieces[-2].strip()
        if not tag:
            raise ValueError(f"Model {model} returned an empty tag for variable {name}.")

        # Double-check that the tag is not in the banned values
        is_banned = banned_values is not None and tag in banned_values

        with open("gpt_log.txt", "a+", encoding="utf-8") as f:
            f.write('----------------------------------\n')
            f.write(f"Variable name: {name}\n\n")
            f.write(f"Model used: {model}\n\n")
            f.write(f"Messages sent to the model:\n{messages}\n\n")
            f.write(f"Tag generated by the model:\n{tag}\n\n")
            if is_banned:
                f.write('That tag is banned, returning name.\n')

        return name if is_banned else tag

    @staticmethod
    def deduplicate_tags(df: pd.DataFrame) -> pd.DataFrame:
        """
        Ensure that the tags in df are unique, by making the tag column of any row
        with a seen-before tag equal to the name column of that row. The origin of
        every replaced tag is set to `TagOrigin.NAME`.

        The dataframe is modified in place; the same object is returned for convenience.

        Parameters:
            df: The dataframe to be deduplicated.

        Returns:
            The deduplicated dataframe.

        Raises:
            ValueError: If df lacks the "Name", "Tag" or "TagOrigin" column.
        """

        TagUtils.check_columns(df, ["Name", "Tag", "TagOrigin"])
        seen_tags = set()
        for i, row in df.iterrows():
            tag = row["Tag"]
            if tag in seen_tags:
                df.loc[i, "Tag"] = row["Name"]
                df.loc[i, "TagOrigin"] = TagOrigin.NAME
                # The name is now in use as a tag, so later rows must not reuse it either.
                seen_tags.add(row["Name"])
            else:
                seen_tags.add(tag)
        return df

    @staticmethod
    def set_tag(df: pd.DataFrame, name: str, tag: str, info: str = "") -> None:
        """
        Tag a parsed or prepared variable for easier access.

        The dataframe is modified in place and a confirmation message is printed.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name: The name of the parsed or prepared variable.
            tag: The tag to be set.
            info: A string describing the type of variable being tagged (parsed or prepared).

        Raises:
            ValueError: If the name is not the name of a parsed or prepared variable.
        """
        TagUtils.check_columns(df, ["Name", "Tag"])
        if name not in df["Name"].values:
            raise ValueError(f"{name} is not the name of a {info} variable.")

        df.loc[df["Name"] == name, "Tag"] = tag
        print(f"Variable {name} tagged as {tag}")

    @staticmethod
    def get_tag(df: pd.DataFrame, name: str, info: str = "") -> str:
        """
        Retrieve the tag of a parsed or prepared variable.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name: The name of the parsed or prepared variable.
            info: A string describing the type of variable being tagged (parsed or prepared).

        Returns:
            The tag of the first row whose name equals `name`.

        Raises:
            ValueError: If the name is not the name of a parsed or prepared variable.
        """

        TagUtils.check_columns(df, ["Name", "Tag"])
        if name not in df["Name"].values:
            raise ValueError(f"{name} is not the name of a {info} variable.")

        return df.loc[df["Name"] == name, "Tag"].values[0]

    @staticmethod
    def name_of(df: pd.DataFrame, name_or_tag: str, info: str = "") -> str:
        """
        Determine the name of a parsed or prepared variable, given either itself or its tag.

        Surrounding whitespace in `name_or_tag` is ignored. Names take precedence over tags.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name_or_tag: The name or tag of the parsed or prepared variable.
            info: A string describing the type of variable in question (parsed or prepared).

        Returns:
            The name of the parsed or prepared variable.

        Raises:
            ValueError: If `name_or_tag` is neither a name nor a tag in df.
        """

        TagUtils.check_columns(df, ["Name", "Tag"])
        name_or_tag = name_or_tag.strip()
        if name_or_tag in df["Name"].values:
            return name_or_tag
        if name_or_tag in df["Tag"].values:
            return df.loc[df["Tag"] == name_or_tag, "Name"].values[0]
        raise ValueError(
            f"{name_or_tag} is not the name or tag of a {info} variable."
        )

    @staticmethod
    def tag_of(df: pd.DataFrame, name_or_tag: Optional[str], info: str = "") -> Optional[str]:
        """
        Determine the tag of a parsed or prepared variable, given either itself or its name.
        Return None if the variable is None.

        Surrounding whitespace in `name_or_tag` is ignored. Tags take precedence over names.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name_or_tag: The name or tag of the parsed or prepared variable.
            info: A string describing the type of variable in question (parsed or prepared).

        Returns:
            The tag of the parsed or prepared variable.

        Raises:
            ValueError: If `name_or_tag` is neither a tag nor a name in df.
        """

        if name_or_tag is None:
            return None

        TagUtils.check_columns(df, ["Name", "Tag"])
        name_or_tag = name_or_tag.strip()
        if name_or_tag in df["Tag"].values:
            return name_or_tag
        if name_or_tag in df["Name"].values:
            return df.loc[df["Name"] == name_or_tag, "Tag"].values[0]
        raise ValueError(
            f"{name_or_tag} is not the name or tag of a {info} variable."
        )

check_columns(df, columns) staticmethod

Check that the specified columns exist in the dataframe.

Parameters:

Name Type Description Default
df DataFrame

The dataframe to be checked.

required
columns list

The columns to be checked.

required

Raises:

Type Description
ValueError

If any of the columns are not present in the dataframe.

Source code in src/logos/tag_utils.py
@staticmethod
def check_columns(df: pd.DataFrame, columns: list) -> None:
    """
    Check that the specified columns exist in the dataframe.

    Parameters:
        df: The dataframe to be checked.
        columns: The columns to be checked.

    Raises:
        ValueError: If any of the columns are not present in the dataframe.
    """
    if not set(columns).issubset(set(df.columns)):
        raise ValueError(f"Columns {columns} are not all present in the dataframe.")

check_fields(series, fields) staticmethod

Check that the specified fields exist in the specified series.

Parameters:

Name Type Description Default
series Series

The series to be checked.

required
fields list

The fields to be checked.

required

Raises:

Type Description
ValueError

If any of the fields are not present in the series.

Source code in src/logos/tag_utils.py
@staticmethod
def check_fields(series: pd.Series, fields: list) -> None:
    """
    Check that the specified fields exist in the specified series.

    Parameters:
        series: The series to be checked.
        fields: The fields to be checked.

    Raises:
        ValueError: If any of the fields are not present in the series.
    """
    if not set(fields).issubset(set(series.index)):
        raise ValueError(f"Fields {fields} are not all present in the series.")

best_effort_tag(templates_df, variable_row, enable_gpt_tagging, gpt_model) staticmethod

Apply gpt_tag to variable_row, if possible, and return the result. If there is no environment variable called OPENAI_API_KEY, or if enable_gpt_tagging is False, apply preceding_tokens_tag instead.

Parameters:

Name Type Description Default
templates_df DataFrame

The dataframe containing information about the log templates.

required
variable_row Series

The row of the dataframe containing information about the parsed variable.

required
enable_gpt_tagging bool

A boolean indicating whether GPT-3.5 tagging should be enabled.

required
gpt_model str

The GPT model to use.

required

Returns:

Type Description
tuple[str, bool]

A tuple containing (i) the GPT-generated tag for the parsed variable name, if possible, or the best-effort tag otherwise, and (ii) a boolean indicating whether the GPT-generated tag was used.

Source code in src/logos/tag_utils.py
@staticmethod
def best_effort_tag(
    templates_df: pd.DataFrame,
    variable_row: pd.Series,
    enable_gpt_tagging: bool,
    gpt_model: str,
) -> tuple[str, bool]:
    """
    Apply `gpt_tag` to `variable_row`, if possible, and return the result. If
    `enable_gpt_tagging` is False, or if `gpt_tag` raises an exception (for instance
    because the OpenAI client cannot be created or the request fails), apply
    `preceding_tokens_tag` instead.

    Parameters:
        templates_df: The dataframe containing information about the log templates.
        variable_row: The row of the dataframe containing information about the parsed variable.
        enable_gpt_tagging: A boolean indicating whether GPT tagging should be attempted.
        gpt_model: The GPT model to use.

    Returns:
        A tuple containing (i) the GPT tag for the parsed variable name, if possible, or the
        tag derived from the preceding tokens otherwise, and (ii) a boolean indicating
        whether the GPT tag was used.
    """
    if enable_gpt_tagging:
        try:
            return (TagUtils.gpt_tag(templates_df, variable_row, gpt_model), True)
        except Exception:
            # Deliberate best effort: any GPT failure falls through to the cheap
            # heuristic below. KeyboardInterrupt/SystemExit are left to propagate.
            pass

    # `preceding_tokens_tag` returns (tag, origin); only the tag belongs in the result.
    tag, _ = TagUtils.preceding_tokens_tag(variable_row)
    return (tag, False)

waterfall_tag(templates_df, variable_row, banned_values=None) staticmethod

Apply each of the tagging methods in turn, in order of increasing cost, until a tag is found that is not included in the banned values. In particular, apply preceding_tokens_tag first, then gpt_tag with the GPT-3.5 model, and finally gpt_tag with the GPT-4 model. If none of these methods succeeds, return the name of the variable as the tag.

Parameters:

Name Type Description Default
templates_df DataFrame

The dataframe containing information about the log templates.

required
variable_row Series

The row of the dataframe containing information about the parsed variable.

required
banned_values Optional[list[str]]

A list of values that should not be used as tags.

None

Returns:

Type Description
tuple[str, TagOrigin]

A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.

Source code in src/logos/tag_utils.py
@staticmethod
def waterfall_tag(
    templates_df: pd.DataFrame,
    variable_row: pd.Series,
    banned_values: Optional[list[str]] = None,
) -> tuple[str, TagOrigin]:
    """
    Try the tagging methods one after another, cheapest first, and return the first tag
    that differs from the variable name: `preceding_tokens_tag`, then `gpt_tag` with the
    GPT-3.5 model, then `gpt_tag` with the GPT-4 model. When every method fails, the name
    of the variable is used as the tag.

    Exceptions raised by `gpt_tag` are printed and otherwise ignored.

    Parameters:
        templates_df: The dataframe containing information about the log templates.
        variable_row: The row of the dataframe containing information about the parsed variable.
        banned_values: A list of values that should not be used as tags.

    Returns:
        A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.
    """
    name = variable_row["Name"]
    if variable_row["From regex"]:
        return (name, TagOrigin.REGEX_VARIABLE)

    # Cheapest option first: the tokens preceding the variable in its template.
    tag, origin = TagUtils.preceding_tokens_tag(variable_row, banned_values)
    if tag != name:
        return (tag, origin)

    # Then the GPT models, cheapest first: (model id, resulting origin, label used in messages).
    gpt_attempts = (
        ("gpt-3.5-turbo", TagOrigin.GPT_3POINT5_TURBO, "GPT-3.5"),
        ("gpt-4", TagOrigin.GPT_4, "GPT-4"),
    )
    for model, model_origin, label in gpt_attempts:
        try:
            tag = TagUtils.gpt_tag(templates_df, variable_row, model, banned_values)
        except Exception as e:
            print(f"Exception {e} came up while tagging {name} with {label}.")
            continue
        if tag != name:
            return (tag, model_origin)

    return (name, TagOrigin.NAME)

preceding_tokens_tag(variable_row, banned_values=None) staticmethod

Try to derive a tag for a parsed variable name based on the preceding tokens in the corresponding template.

Parameters:

Name Type Description Default
variable_row Series

The row of the dataframe containing information about the parsed variable.

required
banned_values Optional[list[str]]

A list of values that should not be used as tags.

None

Returns:

Type Description
tuple[str, TagOrigin]

A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.

Source code in src/logos/tag_utils.py
@staticmethod
def preceding_tokens_tag(
    variable_row: pd.Series, banned_values: Optional[list[str]] = None
) -> tuple[str, TagOrigin]:
    """
    Try to derive a tag for a parsed variable name based on the preceding tokens in the corresponding template.

    Two token patterns are recognised, where `sep` is a token contained in ":=":

    * `... key sep` directly before the variable, and
    * exactly `key sep quote` before the variable, where `quote` is a quote character.

    In both cases `key` becomes the tag unless it starts with "<".
    If no pattern matches, or the candidate is banned, the variable name is returned.

    Parameters:
        variable_row: The row of the dataframe containing information about the parsed variable.
        banned_values: A list of values that should not be used as tags.

    Returns:
        A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.
    """

    TagUtils.check_fields(variable_row, ["Preceding 3 tokens", "Name", "From regex"])
    name = variable_row["Name"]
    if variable_row["From regex"]:
        return name, TagOrigin.REGEX_VARIABLE

    preceding = variable_row["Preceding 3 tokens"]
    separators = ":="
    quotes = "\"'"

    candidate = None
    if (
        len(preceding) >= 2
        and preceding[-1] in separators
        and preceding[-2][0] != "<"
    ):
        candidate = preceding[-2]
    elif (
        len(preceding) == 3
        and preceding[2] in quotes
        and preceding[1] in separators
        and preceding[0][0] != "<"
    ):
        candidate = preceding[0]

    if candidate is None:
        return name, TagOrigin.NAME

    # Double-check that the tag is not in the banned values
    if banned_values is not None and candidate in banned_values:
        return name, TagOrigin.NAME

    return candidate, TagOrigin.PRECEDING

gpt_tag(templates_df, variable_row, model='gpt-3.5-turbo', banned_values=None) staticmethod

Use GPT to derive a tag for the variable described in variable_row, using information about the corresponding log template, retrieved from templates_df.

Parameters:

Name Type Description Default
templates_df DataFrame

The dataframe containing information about the log templates.

required
variable_row Series

The row of the dataframe containing information about the parsed variable.

required
model str

The GPT model to use.

'gpt-3.5-turbo'
banned_values Optional[list[str]]

A list of values that should not be used as tags.

None

Returns:

Type Description
str

The GPT-generated tag for the parsed variable name.

Source code in src/logos/tag_utils.py
@staticmethod
def gpt_tag(
    templates_df: pd.DataFrame,
    variable_row: pd.Series,
    model: str = "gpt-3.5-turbo",
    banned_values: Optional[list[str]] = None,
) -> str:
    """
    Use GPT to derive a tag for the variable described in `variable_row`,
    using information about the corresponding log template, retrieved from `templates_df`.

    Every call appends the request and the resulting tag to `gpt_log.txt` in the
    current working directory.

    Parameters:
        templates_df: The dataframe containing information about the log templates.
        variable_row: The row of the dataframe containing information about the parsed variable.
        model: The GPT model to use.
        banned_values: A list of values that should not be used as tags.

    Returns:
        The GPT-generated tag for the parsed variable name, or the variable name itself
        if the generated tag is one of the banned values.

    Raises:
        ValueError: If required fields/columns are missing, if no template matches the
            variable, or if no single-word tag can be extracted from the model's reply.
    """

    TagUtils.check_fields(variable_row, ["Name", "Examples"])
    TagUtils.check_columns(templates_df, ["TemplateId", "TemplateExample"])

    name = variable_row["Name"]
    template_id = ParsedVariableName(name).template_id()
    idx = ParsedVariableName(name).index()

    examples = templates_df.loc[
        templates_df["TemplateId"] == template_id, "TemplateExample"
    ]
    if examples.empty:
        raise ValueError(
            f"No template with id {template_id} was found for variable {name}."
        )
    line = examples.values[0]
    line_toks = line.split()

    # Define the messages to send to the model
    messages = [
        {
            "role": "system",
            "content": "You are a backend engineer that knows all about the logging infrastructure of a distributed system.",
        },
        {
            "role": "user",
            "content": f"""Generate a tag for the variable that takes the value {line_toks[idx]} """
            f"""in the following log line:\n {line}\n"""
            f"""Here are the 3 tokens that precede the variable: [{', '.join(line_toks[max(idx-3, 0):idx])} ]\n"""
            f"""Here are some more example values for this variable: [{', '.join(variable_row['Examples'])} ]\n"""
            # The banned values are not part of the prompt; they are enforced on the reply below.
            """Return only the tag as a single word, possibly including underscores. DO NOT EVER REPLY WITH MORE THAN ONE WORD.\n""",
        },
    ]

    client = OpenAI()
    response = client.chat.completions.create(model=model, messages=messages)
    content = response.choices[0].message.content
    if content is None:
        raise ValueError(f"Model {model} returned no content for variable {name}.")

    # Strip surrounding whitespace so that e.g. a trailing newline does not end up in the tag.
    tag = content.strip()
    if len(tag.split()) > 1:
        # GPT didn't listen to us and returned a phrase describing the tag.
        # Extract the word between the second-last and last occurrence of double quotes.
        pieces = tag.split('"')
        if len(pieces) < 3:
            raise ValueError(
                f"Could not extract a single-word tag for variable {name} from reply: {tag!r}"
            )
        tag = pieces[-2].strip()
    if not tag:
        raise ValueError(f"Model {model} returned an empty tag for variable {name}.")

    # Double-check that the tag is not in the banned values
    is_banned = banned_values is not None and tag in banned_values

    with open("gpt_log.txt", "a+", encoding="utf-8") as f:
        f.write('----------------------------------\n')
        f.write(f"Variable name: {name}\n\n")
        f.write(f"Model used: {model}\n\n")
        f.write(f"Messages sent to the model:\n{messages}\n\n")
        f.write(f"Tag generated by the model:\n{tag}\n\n")
        if is_banned:
            f.write('That tag is banned, returning name.\n')

    return name if is_banned else tag

deduplicate_tags(df) staticmethod

Ensure that the tags in df are unique, by making the tag column of any row with a seen-before tag equal to the name column of that row.

Parameters:

Name Type Description Default
df DataFrame

The dataframe to be deduplicated.

required

Returns:

Type Description
DataFrame

The deduplicated dataframe.

Source code in src/logos/tag_utils.py
@staticmethod
def deduplicate_tags(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure that the tags in df are unique, by making the tag column of any row
    with a seen-before tag equal to the name column of that row. The origin of
    every replaced tag is set to `TagOrigin.NAME`.

    The dataframe is modified in place; the same object is returned for convenience.

    Parameters:
        df: The dataframe to be deduplicated.

    Returns:
        The deduplicated dataframe.

    Raises:
        ValueError: If df lacks the "Name", "Tag" or "TagOrigin" column.
    """

    TagUtils.check_columns(df, ["Name", "Tag", "TagOrigin"])
    seen_tags = set()
    for i, row in df.iterrows():
        tag = row["Tag"]
        if tag in seen_tags:
            df.loc[i, "Tag"] = row["Name"]
            df.loc[i, "TagOrigin"] = TagOrigin.NAME
            # The name is now in use as a tag, so later rows must not reuse it either.
            seen_tags.add(row["Name"])
        else:
            seen_tags.add(tag)
    return df

set_tag(df, name, tag, info='') staticmethod

Tag a parsed or prepared variable for easier access.

Parameters:

Name Type Description Default
df DataFrame

The dataframe containing the parsed or prepared variables.

required
name str

The name of the parsed or prepared variable.

required
tag str

The tag to be set.

required
info str

A string describing the type of variable being tagged (parsed or prepared).

''

Raises:

Type Description
ValueError

If the name is not the name of a parsed or prepared variable.

Source code in src/logos/tag_utils.py
@staticmethod
def set_tag(df: pd.DataFrame, name: str, tag: str, info: str = "") -> None:
    """
    Assign a tag to a parsed or prepared variable so that it is easier to refer to.

    The dataframe is modified in place and a confirmation message is printed.

    Parameters:
        df: The dataframe containing the parsed or prepared variables.
        name: The name of the parsed or prepared variable.
        tag: The tag to be set.
        info: A string describing the type of variable being tagged (parsed or prepared).

    Raises:
        ValueError: If the name is not the name of a parsed or prepared variable.
    """
    TagUtils.check_columns(df, ["Name", "Tag"])
    if name not in df["Name"].values:
        raise ValueError(f"{name} is not the name of a {info} variable.")

    matching_rows = df["Name"] == name
    df.loc[matching_rows, "Tag"] = tag
    print(f"Variable {name} tagged as {tag}")

get_tag(df, name, info='') staticmethod

Retrieve the tag of a parsed or prepared variable.

Parameters:

Name Type Description Default
df DataFrame

The dataframe containing the parsed or prepared variables.

required
name str

The name of the parsed or prepared variable.

required
info str

A string describing the type of variable being tagged (parsed or prepared).

''

Raises:

Type Description
ValueError

If the name is not the name of a parsed or prepared variable.

Source code in src/logos/tag_utils.py
@staticmethod
def get_tag(df: pd.DataFrame, name: str, info: str = "") -> str:
    """
    Look up the tag of a parsed or prepared variable by its name.

    Parameters:
        df: The dataframe containing the parsed or prepared variables.
        name: The name of the parsed or prepared variable.
        info: A string describing the type of variable being tagged (parsed or prepared).

    Returns:
        The tag of the first row whose name equals `name`.

    Raises:
        ValueError: If the name is not the name of a parsed or prepared variable.
    """

    TagUtils.check_columns(df, ["Name", "Tag"])
    if name not in df["Name"].values:
        raise ValueError(f"{name} is not the name of a {info} variable.")

    matching_tags = df.loc[df["Name"] == name, "Tag"]
    return matching_tags.values[0]

name_of(df, name_or_tag, info='') staticmethod

Determine the name of a parsed or prepared variable, given either itself or its tag.

Parameters:

Name Type Description Default
df DataFrame

The dataframe containing the parsed or prepared variables.

required
name_or_tag str

The name or tag of the parsed or prepared variable.

required
info str

A string describing the type of variable in question (parsed or prepared).

''

Returns:

Type Description
str

The name of the parsed or prepared variable.

Source code in src/logos/tag_utils.py
@staticmethod
def name_of(df: pd.DataFrame, name_or_tag: str, info: str = "") -> str:
    """
    Resolve a parsed or prepared variable to its name, given either its name or its tag.

    Surrounding whitespace in `name_or_tag` is ignored. Names take precedence over tags.

    Parameters:
        df: The dataframe containing the parsed or prepared variables.
        name_or_tag: The name or tag of the parsed or prepared variable.
        info: A string describing the type of variable in question (parsed or prepared).

    Returns:
        The name of the parsed or prepared variable.

    Raises:
        ValueError: If `name_or_tag` is neither a name nor a tag in df.
    """

    TagUtils.check_columns(df, ["Name", "Tag"])
    key = name_or_tag.strip()
    if key in df["Name"].values:
        return key
    if key in df["Tag"].values:
        names_for_tag = df.loc[df["Tag"] == key, "Name"]
        return names_for_tag.values[0]
    raise ValueError(
        f"{key} is not the name or tag of a {info} variable."
    )

tag_of(df, name_or_tag, info='') staticmethod

Determine the tag of a parsed or prepared variable, given either itself or its name. Return None if the variable is None.

Parameters:

Name Type Description Default
df DataFrame

The dataframe containing the parsed or prepared variables.

required
name_or_tag Optional[str]

The name or tag of the parsed or prepared variable.

required
info str

A string describing the type of variable in question (parsed or prepared).

''

Returns:

Type Description
Optional[str]

The tag of the parsed or prepared variable.

Source code in src/logos/tag_utils.py
@staticmethod
def tag_of(df: pd.DataFrame, name_or_tag: Optional[str], info: str = "") -> Optional[str]:
    """
    Determine the tag of a parsed or prepared variable, given either itself or its name.
    Retuirn None if the variable is None.

    Parameters:
        df: The dataframe containing the parsed or prepared variables.
        name_or_tag: The name or tag of the parsed or prepared variable.
        info: A string describing the type of variable in question (parsed or prepared).

    Returns:
        The tag of the parsed or prepared variable.
    """

    if name_or_tag is None:
        return None

    TagUtils.check_columns(df, ["Name", "Tag"])
    name_or_tag = name_or_tag.strip()
    if name_or_tag in df["Tag"].values:
        return name_or_tag
    elif name_or_tag in df["Name"].values:
        return df.loc[df["Name"] == name_or_tag, "Tag"].values[0]
    else:
        raise ValueError(
            f"{name_or_tag} is not the name or tag of a {info} variable."
        )