TagUtils

`TagOrigin`

Bases: IntEnum

Source code in src/logos/tag_utils.py

class TagOrigin(IntEnum):
    """
    Identifies which tagging method produced the tag of a variable.
    """

    # NOTE: the members are deliberately left unannotated. The typing spec asks
    # type checkers to flag annotated enum members, since the declared type
    # (`int`) would hide the fact that each attribute is a `TagOrigin` member.

    PRECEDING = 0
    """Indicates that the tag was derived from the preceding tokens in the corresponding template."""

    GPT_3POINT5_TURBO = 1
    """Indicates that the tag was derived using gpt-3.5-turbo."""

    GPT_4 = 2
    """Indicates that the tag was derived using gpt-4."""

    NAME = 3
    """Indicates that the tag was derived from the name of the variable."""

    REGEX_VARIABLE = 4
    """Indicates that the tag was derived from the name of the variable because the name was given by the user."""

`PRECEDING: int = 0` `class-attribute` `instance-attribute`

Indicates that the tag was derived from the preceding tokens in the corresponding template.

`GPT_3POINT5_TURBO: int = 1` `class-attribute` `instance-attribute`

Indicates that the tag was derived using gpt-3.5-turbo.

`GPT_4: int = 2` `class-attribute` `instance-attribute`

Indicates that the tag was derived using gpt-4.

`NAME: int = 3` `class-attribute` `instance-attribute`

Indicates that the tag was derived from the name of the variable.

`REGEX_VARIABLE: int = 4` `class-attribute` `instance-attribute`

Indicates that the tag was derived from the name of the variable because the name was given by the user.

`TagUtils`

A class for managing tags of parsed and prepared variables.

Source code in src/logos/tag_utils.py

class TagUtils:
    """
    A class for managing tags of parsed and prepared variables.
    """

    @staticmethod
    def check_columns(df: pd.DataFrame, columns: list) -> None:
        """
        Check that the specified columns exist in the dataframe.

        Parameters:
            df: The dataframe to be checked.
            columns: The columns to be checked.

        Raises:
            ValueError: If any of the columns are not present in the dataframe.
        """
        if not set(columns).issubset(set(df.columns)):
            raise ValueError(f"Columns {columns} are not all present in the dataframe.")

    @staticmethod
    def check_fields(series: pd.Series, fields: list) -> None:
        """
        Check that the specified fields exist in the specified series.

        Parameters:
            series: The series to be checked.
            fields: The fields to be checked.

        Raises:
            ValueError: If any of the fields are not present in the series.
        """
        if not set(fields).issubset(set(series.index)):
            raise ValueError(f"Fields {fields} are not all present in the series.")

    @staticmethod
    def best_effort_tag(
        templates_df: pd.DataFrame,
        variable_row: pd.Series,
        enable_gpt_tagging: bool,
        gpt_model: str,
    ) -> tuple[str, bool]:
        """
        Apply `gpt_tag` to `variable_row`, if possible, and return the result. If
        `enable_gpt_tagging` is False, or if `gpt_tag` fails for any reason,
        apply `preceding_tokens_tag` instead.

        Parameters:
            templates_df: The dataframe containing information about the log templates.
            variable_row: The row of the dataframe containing information about the parsed variable.
            enable_gpt_tagging: A boolean indicating whether GPT tagging should be enabled.
            gpt_model: The GPT model to use.

        Returns:
            A tuple containing (i) the GPT tag for the parsed variable name, if possible, or the
            best-effort tag otherwise, and (ii) a boolean indicating whether the GPT tag was used.
        """
        if enable_gpt_tagging:
            try:
                return (TagUtils.gpt_tag(templates_df, variable_row, gpt_model), True)
            except Exception:
                # Deliberate best-effort: any GPT failure falls back to the cheap
                # tagger below. `Exception` (rather than a bare `except`) lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

        # `preceding_tokens_tag` returns (tag, origin); only the tag belongs in
        # the (str, bool) tuple this method promises.
        tag, _origin = TagUtils.preceding_tokens_tag(variable_row)
        return (tag, False)

    @staticmethod
    def waterfall_tag(
        templates_df: pd.DataFrame,
        variable_row: pd.Series,
        banned_values: Optional[list[str]] = None,
    ) -> tuple[str, TagOrigin]:
        """
        Apply each of the tagging methods in turn, in order of increasing cost, until a tag is found
        that is not included in the banned values. In particular, apply `preceding_tokens_tag` first,
        then `gpt_tag` with the GPT-3.5 model, and finally `gpt_tag` with the GPT-4 model. If none of
        these methods succeeds, return the name of the variable as the tag.

        Parameters:
            templates_df: The dataframe containing information about the log templates.
            variable_row: The row of the dataframe containing information about the parsed variable.
            banned_values: A list of values that should not be used as tags.

        Returns:
            A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.
        """
        name = variable_row["Name"]
        if variable_row["From regex"]:
            return (name, TagOrigin.REGEX_VARIABLE)

        # Try to derive a tag from the preceding tokens in the corresponding template.
        # Both taggers signal "no usable tag" by handing back the variable name.
        tag, origin = TagUtils.preceding_tokens_tag(variable_row, banned_values)
        if tag != name:
            return (tag, origin)

        # Try to derive a tag using GPT-3.5
        try:
            tag = TagUtils.gpt_tag(
                templates_df, variable_row, "gpt-3.5-turbo", banned_values
            )
            if tag != name:
                return (tag, TagOrigin.GPT_3POINT5_TURBO)
        except Exception as e:
            # Report the failure and fall through to the next method.
            print(f"Exception {e} came up while tagging {name} with GPT-3.5.")

        # Try to derive a tag using GPT-4
        try:
            tag = TagUtils.gpt_tag(templates_df, variable_row, "gpt-4", banned_values)
            if tag != name:
                return (tag, TagOrigin.GPT_4)
        except Exception as e:
            # Report the failure and fall back to the variable name.
            print(f"Exception {e} came up while tagging {name} with GPT-4.")

        return (name, TagOrigin.NAME)

    @staticmethod
    def preceding_tokens_tag(
        variable_row: pd.Series, banned_values: Optional[list[str]] = None
    ) -> tuple[str, TagOrigin]:
        """
        Try to derive a tag for a parsed variable name based on the preceding tokens in the corresponding template.

        Parameters:
            variable_row: The row of the dataframe containing information about the parsed variable.
            banned_values: A list of values that should not be used as tags.

        Returns:
            A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.

        Raises:
            ValueError: If `variable_row` lacks any of the fields this method reads.
        """

        TagUtils.check_fields(variable_row, ["Preceding 3 tokens", "Name", "From regex"])
        name = variable_row["Name"]
        if variable_row["From regex"]:
            return name, TagOrigin.REGEX_VARIABLE

        pr = variable_row["Preceding 3 tokens"]
        tag = name
        origin = TagOrigin.NAME
        # Pattern 1: `<key> <separator> <variable>`, where the separator is made
        # of ':' / '=' characters and the key is not itself a `<...>` placeholder.
        if len(pr) >= 2 and (pr[-1] in ":=") and (pr[-2][0] != "<"):
            tag = pr[-2]
            origin = TagOrigin.PRECEDING
        # Pattern 2: `<key> <separator> <quote> <variable>`, i.e. the value is
        # preceded by an opening single or double quote.
        elif (
            len(pr) == 3
            and (pr[2] in """"'""")
            and (pr[1] in ":=")
            and (pr[0][0] != "<")
        ):
            tag = pr[0]
            origin = TagOrigin.PRECEDING

        # Double-check that the tag is not in the banned values
        if banned_values is not None and tag in banned_values:
            return name, TagOrigin.NAME

        return tag, origin

    @staticmethod
    def gpt_tag(
        templates_df: pd.DataFrame,
        variable_row: pd.Series,
        model: str = "gpt-3.5-turbo",
        banned_values: Optional[list[str]] = None,
    ) -> str:
        """
        Use GPT to derive a tag for the variable described in `variable_row`,
        using information about the corresponding log template, retrieved from `templates_df`.
        Every successful exchange is appended to `gpt_log.txt` in the working directory.

        Parameters:
            templates_df: The dataframe containing information about the log templates.
            variable_row: The row of the dataframe containing information about the parsed variable.
            model: The GPT model to use.
            banned_values: A list of values that should not be used as tags.

        Returns:
            The GPT-generated tag for the parsed variable name, or the name of the
            variable if the generated tag is one of the banned values.

        Raises:
            ValueError: If required fields or columns are missing, or if no usable
                single-word tag can be extracted from the reply of the model.
        """

        TagUtils.check_fields(variable_row, ["Name", "Examples"])
        TagUtils.check_columns(templates_df, ["TemplateId", "TemplateExample"])

        parsed_name = ParsedVariableName(variable_row["Name"])
        template_id = parsed_name.template_id()
        idx = parsed_name.index()

        line = templates_df[templates_df["TemplateId"] == template_id][
            "TemplateExample"
        ].values[0]
        line_toks = line.split()

        # Define the messages to send to the model
        messages = [
            {
                "role": "system",
                "content": "You are a backend engineer that knows all about the logging infrastructure of a distributed system.",
            },
            {
                "role": "user",
                "content": f"""Generate a tag for the variable that takes the value {line_toks[idx]} """
                f"""in the following log line:\n {line}\n"""
                f"""Here are the 3 tokens that precede the variable: [{', '.join(line_toks[max(idx-3, 0):idx])} ]\n"""
                f"""Here are some more example values for this variable: [{', '.join(variable_row['Examples'])} ]\n"""
                #f"""Make sure the tag matches none of the following values: [{', '.join(banned_values) if banned_values is not None else ''} ]\n"""
                """Return only the tag as a single word, possibly including underscores. DO NOT EVER REPLY WITH MORE THAN ONE WORD.\n""",
            },
        ]

        client = OpenAI()

        reply = (
            client.chat.completions.create(model=model, messages=messages)
            .choices[0]
            .message.content
        )
        if reply is None:
            raise ValueError(
                f"Model {model} returned no content for variable {variable_row['Name']}."
            )

        # Surrounding whitespace (e.g. a trailing newline) must not become part
        # of the tag, otherwise the banned-value comparison below can miss.
        tag = reply.strip()
        if len(tag.split()) > 1:
            # GPT didn't listen to us and returned a phrase describing the tag.
            # Extract the word between the second-last and last occurrence of double quotes.
            pieces = tag.split('"')
            if len(pieces) < 3:
                # Fewer than two double quotes: there is no quoted word to extract.
                raise ValueError(
                    f"Could not extract a single-word tag from the reply of {model}: {reply!r}"
                )
            tag = pieces[-2].strip()

        if not tag:
            raise ValueError(f"Model {model} returned an empty tag: {reply!r}")

        # Double-check that the tag is not in the banned values
        is_banned = banned_values is not None and tag in banned_values

        # Explicit UTF-8 so that non-ASCII log content cannot fail to encode
        # under a platform-specific default encoding.
        with open("gpt_log.txt", "a+", encoding="utf-8") as f:
            f.write('----------------------------------\n')
            f.write(f"Variable name: {variable_row['Name']}\n\n")
            f.write(f"Model used: {model}\n\n")
            f.write(f"Messages sent to the model:\n{messages}\n\n")
            f.write(f"Tag generated by the model:\n{tag}\n\n")
            if is_banned:
                f.write('That tag is banned, returning name.\n')

        if is_banned:
            return variable_row["Name"]

        return tag

    @staticmethod
    def deduplicate_tags(df: pd.DataFrame) -> pd.DataFrame:
        """
        Ensure that the tags in df are unique, by making the tag column of any row
        with a seen-before tag equal to the name column of that row. The dataframe
        is modified in place.

        Parameters:
            df: The dataframe to be deduplicated.

        Returns:
            The deduplicated dataframe (the same object that was passed in).

        Raises:
            ValueError: If df lacks any of the `Name`, `Tag` or `TagOrigin` columns.
        """

        TagUtils.check_columns(df, ["Name", "Tag", "TagOrigin"])
        seen_tags = set()
        for i, row in df.iterrows():
            if row["Tag"] in seen_tags:
                df.loc[i, "Tag"] = row["Name"]
                df.loc[i, "TagOrigin"] = TagOrigin.NAME
                # The name is now in use as a tag, so a later row carrying the
                # same string as its tag must be treated as a duplicate too.
                seen_tags.add(row["Name"])
            else:
                seen_tags.add(row["Tag"])
        return df

    @staticmethod
    def set_tag(df: pd.DataFrame, name: str, tag: str, info: str = "") -> None:
        """
        Tag a parsed or prepared variable for easier access.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name: The name of the parsed or prepared variable.
            tag: The tag to be set.
            info: A string describing the type of variable being tagged (parsed or prepared).

        Raises:
            ValueError: If the name is not the name of a parsed or prepared variable.
        """
        TagUtils.check_columns(df, ["Name", "Tag"])
        if name in df["Name"].values:
            df.loc[df["Name"] == name, "Tag"] = tag
            Printer.printv(f"Variable {name} tagged as {tag}")
        else:
            raise ValueError(f"{name} is not the name of a {info} variable.")

    @staticmethod
    def get_tag(df: pd.DataFrame, name: str, info: str = "") -> str:
        """
        Retrieve the tag of a parsed or prepared variable.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name: The name of the parsed or prepared variable.
            info: A string describing the type of variable being tagged (parsed or prepared).

        Returns:
            The tag of the first row whose name matches.

        Raises:
            ValueError: If the name is not the name of a parsed or prepared variable.
        """

        TagUtils.check_columns(df, ["Name", "Tag"])
        if name in df["Name"].values:
            return df.loc[df["Name"] == name, "Tag"].values[0]
        else:
            raise ValueError(f"{name} is not the name of a {info} variable.")

    @staticmethod
    def name_of(df: pd.DataFrame, name_or_tag: str, info: str = "") -> str:
        """
        Determine the name of a parsed or prepared variable, given either itself or its tag.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name_or_tag: The name or tag of the parsed or prepared variable.
            info: A string describing the type of variable in question (parsed or prepared).

        Returns:
            The name of the parsed or prepared variable.

        Raises:
            ValueError: If the argument is neither the name nor the tag of a variable in df.
        """

        TagUtils.check_columns(df, ["Name", "Tag"])
        name_or_tag = name_or_tag.strip()
        # Names take precedence over tags when a string is both.
        if name_or_tag in df["Name"].values:
            return name_or_tag
        elif name_or_tag in df["Tag"].values:
            return df.loc[df["Tag"] == name_or_tag, "Name"].values[0]
        else:
            raise ValueError(
                f"{name_or_tag} is not the name or tag of a {info} variable."
            )

    @staticmethod
    def tag_of(df: pd.DataFrame, name_or_tag: Optional[str], info: str = "") -> Optional[str]:
        """
        Determine the tag of a parsed or prepared variable, given either itself or its name.
        Return None if the variable is None.

        Parameters:
            df: The dataframe containing the parsed or prepared variables.
            name_or_tag: The name or tag of the parsed or prepared variable.
            info: A string describing the type of variable in question (parsed or prepared).

        Returns:
            The tag of the parsed or prepared variable.

        Raises:
            ValueError: If the argument is neither the name nor the tag of a variable in df.
        """

        if name_or_tag is None:
            return None

        TagUtils.check_columns(df, ["Name", "Tag"])
        name_or_tag = name_or_tag.strip()
        # Tags take precedence over names when a string is both.
        if name_or_tag in df["Tag"].values:
            return name_or_tag
        elif name_or_tag in df["Name"].values:
            return df.loc[df["Name"] == name_or_tag, "Tag"].values[0]
        else:
            raise ValueError(
                f"{name_or_tag} is not the name or tag of a {info} variable."
            )

`check_columns(df, columns)` `staticmethod`

Check that the specified columns exist in the dataframe.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The dataframe to be checked.	required
`columns`	`list`	The columns to be checked.	required

Raises:

Type	Description
`ValueError`	If any of the columns are not present in the dataframe.

Source code in src/logos/tag_utils.py

@staticmethod
def check_columns(df: pd.DataFrame, columns: list) -> None:
    """
    Verify that every requested column is present in a dataframe.

    Parameters:
        df: The dataframe to inspect.
        columns: The column labels that must all be present.

    Raises:
        ValueError: If at least one of the columns is missing from the dataframe.
    """
    required = set(columns)
    available = set(df.columns)
    if required <= available:
        return
    raise ValueError(f"Columns {columns} are not all present in the dataframe.")

`check_fields(series, fields)` `staticmethod`

Check that the specified fields exist in the specified series.

Parameters:

Name	Type	Description	Default
`series`	`Series`	The series to be checked.	required
`fields`	`list`	The fields to be checked.	required

Raises:

Type	Description
`ValueError`	If any of the fields are not present in the series.

Source code in src/logos/tag_utils.py

@staticmethod
def check_fields(series: pd.Series, fields: list) -> None:
    """
    Verify that every requested field is present in the index of a series.

    Parameters:
        series: The series to inspect.
        fields: The index labels that must all be present.

    Raises:
        ValueError: If at least one of the fields is missing from the series.
    """
    required = set(fields)
    available = set(series.index)
    if required <= available:
        return
    raise ValueError(f"Fields {fields} are not all present in the series.")

`best_effort_tag(templates_df, variable_row, enable_gpt_tagging, gpt_model)` `staticmethod`

Apply gpt_tag to variable_row, if possible, and return the result. If there is no environment variable called OPENAI_API_KEY, or if enable_gpt_tagging is False, apply preceding_tokens_tag instead.

Parameters:

Name	Type	Description	Default
`templates_df`	`DataFrame`	The dataframe containing information about the log templates.	required
`variable_row`	`Series`	The row of the dataframe containing information about the parsed variable.	required
`enable_gpt_tagging`	`bool`	A boolean indicating whether GPT-3.5 tagging should be enabled.	required
`gpt_model`	`str`	The GPT model to use.	required

Returns:

Type	Description
`tuple[str, bool]`	A tuple containing (i) the GPT tag for the parsed variable name, if possible, or the best-effort tag otherwise, and (ii) a boolean indicating whether the GPT tag was used.

Source code in src/logos/tag_utils.py

@staticmethod
def best_effort_tag(
    templates_df: pd.DataFrame,
    variable_row: pd.Series,
    enable_gpt_tagging: bool,
    gpt_model: str,
) -> tuple[str, bool]:
    """
    Apply `gpt_tag` to `variable_row`, if possible, and return the result. If
    `enable_gpt_tagging` is False, or if `gpt_tag` fails for any reason,
    apply `preceding_tokens_tag` instead.

    Parameters:
        templates_df: The dataframe containing information about the log templates.
        variable_row: The row of the dataframe containing information about the parsed variable.
        enable_gpt_tagging: A boolean indicating whether GPT tagging should be enabled.
        gpt_model: The GPT model to use.

    Returns:
        A tuple containing (i) the GPT tag for the parsed variable name, if possible, or the
        best-effort tag otherwise, and (ii) a boolean indicating whether the GPT tag was used.
    """
    if enable_gpt_tagging:
        try:
            return (TagUtils.gpt_tag(templates_df, variable_row, gpt_model), True)
        except Exception:
            # Deliberate best-effort: any GPT failure falls back to the cheap
            # tagger below. `Exception` (rather than a bare `except`) lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    # `preceding_tokens_tag` returns (tag, origin); only the tag belongs in
    # the (str, bool) tuple this method promises.
    tag, _origin = TagUtils.preceding_tokens_tag(variable_row)
    return (tag, False)

`waterfall_tag(templates_df, variable_row, banned_values=None)` `staticmethod`

Apply each of the tagging methods in turn, in order of increasing cost, until a tag is found that is not included in the banned values. In particular, apply preceding_tokens_tag first, then gpt_tag with the GPT-3.5 model, and finally gpt_tag with the GPT-4 model. If none of these methods succeeds, return the name of the variable as the tag.

Parameters:

Name	Type	Description	Default
`templates_df`	`DataFrame`	The dataframe containing information about the log templates.	required
`variable_row`	`Series`	The row of the dataframe containing information about the parsed variable.	required
`banned_values`	`Optional[list[str]]`	A list of values that should not be used as tags.	`None`

Returns:

Type	Description
`tuple[str, TagOrigin]`	A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.

Source code in src/logos/tag_utils.py

@staticmethod
def waterfall_tag(
    templates_df: pd.DataFrame,
    variable_row: pd.Series,
    banned_values: Optional[list[str]] = None,
) -> tuple[str, TagOrigin]:
    """
    Run the tagging methods from cheapest to most expensive and return the first tag that
    differs from the variable name: `preceding_tokens_tag`, then `gpt_tag` with GPT-3.5,
    then `gpt_tag` with GPT-4. Variables that come from a user-supplied regex keep their
    name as the tag. If every method fails, the name of the variable is returned.

    Parameters:
        templates_df: The dataframe containing information about the log templates.
        variable_row: The row of the dataframe containing information about the parsed variable.
        banned_values: A list of values that should not be used as tags.

    Returns:
        A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.
    """
    var_name = variable_row["Name"]
    if variable_row["From regex"]:
        return (var_name, TagOrigin.REGEX_VARIABLE)

    # Cheapest option first: look at the tokens that precede the variable.
    # Every tagger hands back the variable name when it has nothing better.
    candidate, candidate_origin = TagUtils.preceding_tokens_tag(
        variable_row, banned_values
    )
    if candidate != var_name:
        return (candidate, candidate_origin)

    # Then the GPT models, in order of increasing cost.
    gpt_attempts = (
        ("gpt-3.5-turbo", "GPT-3.5", TagOrigin.GPT_3POINT5_TURBO),
        ("gpt-4", "GPT-4", TagOrigin.GPT_4),
    )
    for model, label, model_origin in gpt_attempts:
        try:
            candidate = TagUtils.gpt_tag(
                templates_df, variable_row, model, banned_values
            )
            if candidate != var_name:
                return (candidate, model_origin)
        except Exception as e:
            # Report the failure and move on to the next (more expensive) option.
            print(f"Exception {e} came up while tagging {var_name} with {label}.")

    return (var_name, TagOrigin.NAME)

`preceding_tokens_tag(variable_row, banned_values=None)` `staticmethod`

Try to derive a tag for a parsed variable name based on the preceding tokens in the corresponding template.

Parameters:

Name	Type	Description	Default
`variable_row`	`Series`	The row of the dataframe containing information about the parsed variable.	required
`banned_values`	`Optional[list[str]]`	A list of values that should not be used as tags.	`None`

Returns:

Type	Description
`tuple[str, TagOrigin]`	A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.

Source code in src/logos/tag_utils.py

@staticmethod
def preceding_tokens_tag(
    variable_row: pd.Series, banned_values: Optional[list[str]] = None
) -> tuple[str, TagOrigin]:
    """
    Derive a tag for a parsed variable from the tokens that precede it in its template,
    falling back to the name of the variable when no suitable token is found.

    Parameters:
        variable_row: The row of the dataframe containing information about the parsed variable.
        banned_values: A list of values that should not be used as tags.

    Returns:
        A tuple containing (i) the tag for the parsed variable, and (ii) the origin of the tag.
    """
    TagUtils.check_fields(variable_row, ["Preceding 3 tokens", "Name", "From regex"])

    var_name = variable_row["Name"]
    if variable_row["From regex"]:
        return var_name, TagOrigin.REGEX_VARIABLE

    preceding = variable_row["Preceding 3 tokens"]
    separators = ":="
    quotes = "\"'"

    # Locate the token that plays the role of a key, if any.
    key_token = None
    if (
        len(preceding) >= 2
        and preceding[-1] in separators
        and preceding[-2][0] != "<"
    ):
        # Shape: key, separator, variable.
        key_token = preceding[-2]
    elif (
        len(preceding) == 3
        and preceding[2] in quotes
        and preceding[1] in separators
        and preceding[0][0] != "<"
    ):
        # Shape: key, separator, opening quote, variable.
        key_token = preceding[0]

    # No key found, or the key is banned: the name is the tag.
    if key_token is None:
        return var_name, TagOrigin.NAME
    if banned_values is not None and key_token in banned_values:
        return var_name, TagOrigin.NAME

    return key_token, TagOrigin.PRECEDING

`gpt_tag(templates_df, variable_row, model='gpt-3.5-turbo', banned_values=None)` `staticmethod`

Use GPT to derive a tag for the variable described in variable_row, using information about the corresponding log template, retrieved from templates_df.

Parameters:

Name	Type	Description	Default
`templates_df`	`DataFrame`	The dataframe containing information about the log templates.	required
`variable_row`	`Series`	The row of the dataframe containing information about the parsed variable.	required
`model`	`str`	The GPT model to use.	`'gpt-3.5-turbo'`
`banned_values`	`Optional[list[str]]`	A list of values that should not be used as tags.	`None`

Returns:

Type	Description
`str`	The GPT-generated tag for the parsed variable name.

Source code in src/logos/tag_utils.py

@staticmethod
def gpt_tag(
    templates_df: pd.DataFrame,
    variable_row: pd.Series,
    model: str = "gpt-3.5-turbo",
    banned_values: Optional[list[str]] = None,
) -> str:
    """
    Use GPT to derive a tag for the variable described in `variable_row`,
    using information about the corresponding log template, retrieved from `templates_df`.
    Every successful exchange is appended to `gpt_log.txt` in the working directory.

    Parameters:
        templates_df: The dataframe containing information about the log templates.
        variable_row: The row of the dataframe containing information about the parsed variable.
        model: The GPT model to use.
        banned_values: A list of values that should not be used as tags.

    Returns:
        The GPT-generated tag for the parsed variable name, or the name of the
        variable if the generated tag is one of the banned values.

    Raises:
        ValueError: If required fields or columns are missing, or if no usable
            single-word tag can be extracted from the reply of the model.
    """

    TagUtils.check_fields(variable_row, ["Name", "Examples"])
    TagUtils.check_columns(templates_df, ["TemplateId", "TemplateExample"])

    parsed_name = ParsedVariableName(variable_row["Name"])
    template_id = parsed_name.template_id()
    idx = parsed_name.index()

    line = templates_df[templates_df["TemplateId"] == template_id][
        "TemplateExample"
    ].values[0]
    line_toks = line.split()

    # Define the messages to send to the model
    messages = [
        {
            "role": "system",
            "content": "You are a backend engineer that knows all about the logging infrastructure of a distributed system.",
        },
        {
            "role": "user",
            "content": f"""Generate a tag for the variable that takes the value {line_toks[idx]} """
            f"""in the following log line:\n {line}\n"""
            f"""Here are the 3 tokens that precede the variable: [{', '.join(line_toks[max(idx-3, 0):idx])} ]\n"""
            f"""Here are some more example values for this variable: [{', '.join(variable_row['Examples'])} ]\n"""
            #f"""Make sure the tag matches none of the following values: [{', '.join(banned_values) if banned_values is not None else ''} ]\n"""
            """Return only the tag as a single word, possibly including underscores. DO NOT EVER REPLY WITH MORE THAN ONE WORD.\n""",
        },
    ]

    client = OpenAI()

    reply = (
        client.chat.completions.create(model=model, messages=messages)
        .choices[0]
        .message.content
    )
    if reply is None:
        raise ValueError(
            f"Model {model} returned no content for variable {variable_row['Name']}."
        )

    # Surrounding whitespace (e.g. a trailing newline) must not become part
    # of the tag, otherwise the banned-value comparison below can miss.
    tag = reply.strip()
    if len(tag.split()) > 1:
        # GPT didn't listen to us and returned a phrase describing the tag.
        # Extract the word between the second-last and last occurrence of double quotes.
        pieces = tag.split('"')
        if len(pieces) < 3:
            # Fewer than two double quotes: there is no quoted word to extract.
            raise ValueError(
                f"Could not extract a single-word tag from the reply of {model}: {reply!r}"
            )
        tag = pieces[-2].strip()

    if not tag:
        raise ValueError(f"Model {model} returned an empty tag: {reply!r}")

    # Double-check that the tag is not in the banned values
    is_banned = banned_values is not None and tag in banned_values

    # Explicit UTF-8 so that non-ASCII log content cannot fail to encode
    # under a platform-specific default encoding.
    with open("gpt_log.txt", "a+", encoding="utf-8") as f:
        f.write('----------------------------------\n')
        f.write(f"Variable name: {variable_row['Name']}\n\n")
        f.write(f"Model used: {model}\n\n")
        f.write(f"Messages sent to the model:\n{messages}\n\n")
        f.write(f"Tag generated by the model:\n{tag}\n\n")
        if is_banned:
            f.write('That tag is banned, returning name.\n')

    if is_banned:
        return variable_row["Name"]

    return tag

`deduplicate_tags(df)` `staticmethod`

Ensure that the tags in df are unique, by making the tag column of any row with a seen-before tag equal to the name column of that row.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The dataframe to be deduplicated.	required

Returns:

Type	Description
`DataFrame`	The deduplicated dataframe.

Source code in src/logos/tag_utils.py

@staticmethod
def deduplicate_tags(df: pd.DataFrame) -> pd.DataFrame:
    """
    Ensure that the tags in df are unique, by making the tag column of any row
    with a seen-before tag equal to the name column of that row. The dataframe
    is modified in place.

    Parameters:
        df: The dataframe to be deduplicated.

    Returns:
        The deduplicated dataframe (the same object that was passed in).

    Raises:
        ValueError: If df lacks any of the `Name`, `Tag` or `TagOrigin` columns.
    """

    TagUtils.check_columns(df, ["Name", "Tag", "TagOrigin"])
    seen_tags = set()
    for i, row in df.iterrows():
        if row["Tag"] in seen_tags:
            df.loc[i, "Tag"] = row["Name"]
            df.loc[i, "TagOrigin"] = TagOrigin.NAME
            # The name is now in use as a tag, so a later row carrying the
            # same string as its tag must be treated as a duplicate too.
            seen_tags.add(row["Name"])
        else:
            seen_tags.add(row["Tag"])
    return df

`set_tag(df, name, tag, info='')` `staticmethod`

Tag a parsed or prepared variable for easier access.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The dataframe containing the parsed or prepared variables.	required
`name`	`str`	The name of the parsed or prepared variable.	required
`tag`	`str`	The tag to be set.	required
`info`	`str`	A string describing the type of variable being tagged (parsed or prepared).	`''`

Raises:

Type	Description
`ValueError`	If the name is not the name of a parsed or prepared variable.

Source code in src/logos/tag_utils.py

@staticmethod
def set_tag(df: pd.DataFrame, name: str, tag: str, info: str = "") -> None:
    """
    Assign a tag to a parsed or prepared variable, in place.

    Parameters:
        df: The dataframe containing the parsed or prepared variables.
        name: The name of the variable to tag.
        tag: The tag to assign.
        info: The kind of variable (parsed or prepared); only used in the error message.

    Raises:
        ValueError: If no row of df carries the given name.
    """
    TagUtils.check_columns(df, ["Name", "Tag"])

    # Guard clause: unknown names are rejected before anything is modified.
    if name not in df["Name"].values:
        raise ValueError(f"{name} is not the name of a {info} variable.")

    matching_rows = df["Name"] == name
    df.loc[matching_rows, "Tag"] = tag
    Printer.printv(f"Variable {name} tagged as {tag}")

`get_tag(df, name, info='')` `staticmethod`

Retrieve the tag of a parsed or prepared variable.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The dataframe containing the parsed or prepared variables.	required
`name`	`str`	The name of the parsed or prepared variable.	required
`info`	`str`	A string describing the type of variable being tagged (parsed or prepared).	`''`

Raises:

Type	Description
`ValueError`	If the name is not the name of a parsed or prepared variable.

Source code in src/logos/tag_utils.py

@staticmethod
def get_tag(df: pd.DataFrame, name: str, info: str = "") -> str:
    """
    Look up the tag of a parsed or prepared variable by its name.

    Parameters:
        df: The dataframe containing the parsed or prepared variables.
        name: The name of the variable whose tag is wanted.
        info: The kind of variable (parsed or prepared); only used in the error message.

    Returns:
        The tag of the first row whose name matches.

    Raises:
        ValueError: If no row of df carries the given name.
    """
    TagUtils.check_columns(df, ["Name", "Tag"])

    # Guard clause: unknown names are rejected up front.
    if name not in df["Name"].values:
        raise ValueError(f"{name} is not the name of a {info} variable.")

    matching_tags = df.loc[df["Name"] == name, "Tag"]
    return matching_tags.values[0]

`name_of(df, name_or_tag, info='')` `staticmethod`

Determine the name of a parsed or prepared variable, given either itself or its tag.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The dataframe containing the parsed or prepared variables.	required
`name_or_tag`	`str`	The name or tag of the parsed or prepared variable.	required
`info`	`str`	A string describing the type of variable in question (parsed or prepared).	`''`

Returns:

Type	Description
`str`	The name of the parsed or prepared variable.

Source code in src/logos/tag_utils.py

@staticmethod
def name_of(df: pd.DataFrame, name_or_tag: str, info: str = "") -> str:
    """
    Resolve a parsed or prepared variable to its name, given either its name or its tag.

    Surrounding whitespace is stripped from `name_or_tag` before the lookup. Names are
    checked before tags, so a value that appears in both columns is treated as a name.

    Parameters:
        df: The dataframe containing the parsed or prepared variables; it must
            have "Name" and "Tag" columns.
        name_or_tag: The name or tag of the parsed or prepared variable.
        info: A string describing the type of variable in question (parsed or prepared);
            only used to build the error message.

    Returns:
        The name of the parsed or prepared variable.

    Raises:
        ValueError: If the stripped value is neither a name nor a tag in the dataframe,
            or if the required columns are missing from the dataframe.
    """

    TagUtils.check_columns(df, ["Name", "Tag"])
    key = name_or_tag.strip()

    # Already a name: hand it back unchanged (after stripping).
    if key in df["Name"].values:
        return key

    if key not in df["Tag"].values:
        raise ValueError(
            f"{key} is not the name or tag of a {info} variable."
        )

    # It is a tag: return the name of the first row carrying that tag.
    names_for_tag = df.loc[df["Tag"] == key, "Name"]
    return names_for_tag.values[0]

`tag_of(df, name_or_tag, info='')` `staticmethod`

Determine the tag of a parsed or prepared variable, given either itself or its name. Return None if the variable is None.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	The dataframe containing the parsed or prepared variables.	required
`name_or_tag`	`Optional[str]`	The name or tag of the parsed or prepared variable.	required
`info`	`str`	A string describing the type of variable in question (parsed or prepared).	`''`

Returns:

Type	Description
`Optional[str]`	The tag of the parsed or prepared variable.

Source code in src/logos/tag_utils.py

@staticmethod
def tag_of(df: pd.DataFrame, name_or_tag: Optional[str], info: str = "") -> Optional[str]:
    """
    Resolve a parsed or prepared variable to its tag, given either its tag or its name.
    Return None if the variable is None.

    Surrounding whitespace is stripped from `name_or_tag` before the lookup. Tags are
    checked before names, so a value that appears in both columns is treated as a tag.

    Parameters:
        df: The dataframe containing the parsed or prepared variables; it must
            have "Name" and "Tag" columns (not checked when `name_or_tag` is None).
        name_or_tag: The name or tag of the parsed or prepared variable, or None.
        info: A string describing the type of variable in question (parsed or prepared);
            only used to build the error message.

    Returns:
        The tag of the parsed or prepared variable, or None if `name_or_tag` is None.

    Raises:
        ValueError: If the stripped value is neither a tag nor a name in the dataframe,
            or if the required columns are missing from the dataframe.
    """

    # None passes straight through, before any validation of the dataframe.
    if name_or_tag is None:
        return None

    TagUtils.check_columns(df, ["Name", "Tag"])
    key = name_or_tag.strip()

    # Already a tag: hand it back unchanged (after stripping).
    if key in df["Tag"].values:
        return key

    if key not in df["Name"].values:
        raise ValueError(
            f"{key} is not the name or tag of a {info} variable."
        )

    # It is a name: return the tag of the first row with that name.
    tags_for_name = df.loc[df["Name"] == key, "Tag"]
    return tags_for_name.values[0]

TagUtils

TagOrigin

PRECEDING: int = 0 class-attribute instance-attribute

GPT_3POINT5_TURBO: int = 1 class-attribute instance-attribute

GPT_4: int = 2 class-attribute instance-attribute

NAME: int = 3 class-attribute instance-attribute

REGEX_VARIABLE: int = 4 class-attribute instance-attribute

TagUtils

check_columns(df, columns) staticmethod

check_fields(series, fields) staticmethod

best_effort_tag(templates_df, variable_row, enable_gpt_tagging, gpt_model) staticmethod

waterfall_tag(templates_df, variable_row, banned_values=None) staticmethod

preceding_tokens_tag(variable_row, banned_values=None) staticmethod

gpt_tag(templates_df, variable_row, model='gpt-3.5-turbo', banned_values=None) staticmethod

deduplicate_tags(df) staticmethod

set_tag(df, name, tag, info='') staticmethod

get_tag(df, name, info='') staticmethod

name_of(df, name_or_tag, info='') staticmethod

tag_of(df, name_or_tag, info='') staticmethod

`TagOrigin`

`PRECEDING: int = 0` `class-attribute` `instance-attribute`

`GPT_3POINT5_TURBO: int = 1` `class-attribute` `instance-attribute`

`GPT_4: int = 2` `class-attribute` `instance-attribute`

`NAME: int = 3` `class-attribute` `instance-attribute`

`REGEX_VARIABLE: int = 4` `class-attribute` `instance-attribute`

`TagUtils`

`check_columns(df, columns)` `staticmethod`

`check_fields(series, fields)` `staticmethod`

`best_effort_tag(templates_df, variable_row, enable_gpt_tagging, gpt_model)` `staticmethod`

`waterfall_tag(templates_df, variable_row, banned_values=None)` `staticmethod`

`preceding_tokens_tag(variable_row, banned_values=None)` `staticmethod`

`gpt_tag(templates_df, variable_row, model='gpt-3.5-turbo', banned_values=None)` `staticmethod`

`deduplicate_tags(df)` `staticmethod`

`set_tag(df, name, tag, info='')` `staticmethod`

`get_tag(df, name, info='')` `staticmethod`

`name_of(df, name_or_tag, info='')` `staticmethod`

`tag_of(df, name_or_tag, info='')` `staticmethod`