application

Classes

Modules

application_service_registry

Provide an application service registry.

Classes
ApplicationServiceRegistry

Define an application service registry.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
class ApplicationServiceRegistry:
    """Define an application service registry."""

    @classmethod
    def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
        """Return a profile reader of the correct type."""
        if profiler is SupportedProfiler.bracken:
            from .bracken import BrackenProfileReader

            return BrackenProfileReader
        elif profiler is SupportedProfiler.centrifuge:
            from .centrifuge import CentrifugeProfileReader

            return CentrifugeProfileReader
        elif profiler is SupportedProfiler.diamond:
            from .diamond import DiamondProfileReader

            return DiamondProfileReader
        elif profiler is SupportedProfiler.ganon:
            from .ganon import GanonProfileReader

            return GanonProfileReader
        elif profiler is SupportedProfiler.kaiju:
            from .kaiju import KaijuProfileReader

            return KaijuProfileReader
        elif profiler is SupportedProfiler.kmcp:
            from .kmcp import KMCPProfileReader

            return KMCPProfileReader
        elif profiler is SupportedProfiler.kraken2:
            from .kraken2 import Kraken2ProfileReader

            return Kraken2ProfileReader
        elif profiler is SupportedProfiler.krakenuniq:
            from .krakenuniq import KrakenUniqProfileReader

            return KrakenUniqProfileReader
        elif profiler is SupportedProfiler.megan6:
            from .megan6 import Megan6ProfileReader

            return Megan6ProfileReader
        elif profiler is SupportedProfiler.metaphlan:
            from .metaphlan import MetaphlanProfileReader

            return MetaphlanProfileReader
        elif profiler is SupportedProfiler.motus:
            from .motus import MotusProfileReader

            return MotusProfileReader
        else:
            raise ValueError(f"Unexpected profiler: {profiler}.")

    @classmethod
    def profile_standardisation_service(
        cls, profiler: SupportedProfiler
    ) -> Type[ProfileStandardisationService]:
        """Return a profile standardisation service of the correct type."""
        if profiler is SupportedProfiler.bracken:
            from .bracken import BrackenProfileStandardisationService

            return BrackenProfileStandardisationService
        elif profiler is SupportedProfiler.centrifuge:
            from .centrifuge import CentrifugeProfileStandardisationService

            return CentrifugeProfileStandardisationService
        elif profiler is SupportedProfiler.diamond:
            from .diamond import DiamondProfileStandardisationService

            return DiamondProfileStandardisationService
        elif profiler is SupportedProfiler.kaiju:
            from .kaiju import KaijuProfileStandardisationService

            return KaijuProfileStandardisationService
        elif profiler is SupportedProfiler.kraken2:
            from .kraken2 import Kraken2ProfileStandardisationService

            return Kraken2ProfileStandardisationService
        elif profiler is SupportedProfiler.krakenuniq:
            from .krakenuniq import KrakenUniqProfileStandardisationService

            return KrakenUniqProfileStandardisationService
        elif profiler is SupportedProfiler.megan6:
            from .megan6 import Megan6ProfileStandardisationService

            return Megan6ProfileStandardisationService
        elif profiler is SupportedProfiler.motus:
            from .motus import MotusProfileStandardisationService

            return MotusProfileStandardisationService
        elif profiler is SupportedProfiler.metaphlan:
            from .metaphlan import MetaphlanProfileStandardisationService

            return MetaphlanProfileStandardisationService
        elif profiler is SupportedProfiler.ganon:
            from .ganon import GanonProfileStandardisationService

            return GanonProfileStandardisationService
        elif profiler is SupportedProfiler.kmcp:
            from .kmcp import KMCPProfileStandardisationService

            return KMCPProfileStandardisationService
        else:
            raise ValueError(f"Unexpected profiler: {profiler}.")

    @classmethod
    def standard_profile_writer(
        cls, file_format: StandardProfileFileFormat
    ) -> Type[StandardProfileWriter]:
        """Return a standard profile writer of the correct type."""
        if file_format is StandardProfileFileFormat.TSV:
            from .standard_profile_writer.tsv_standard_profile_writer import (
                TSVStandardProfileWriter,
            )

            return TSVStandardProfileWriter
        elif file_format is StandardProfileFileFormat.CSV:
            from .standard_profile_writer.csv_standard_profile_writer import (
                CSVStandardProfileWriter,
            )

            return CSVStandardProfileWriter
        elif file_format is StandardProfileFileFormat.XLSX:
            from .standard_profile_writer.xlsx_standard_profile_writer import (
                XLSXStandardProfileWriter,
            )

            return XLSXStandardProfileWriter
        elif file_format is StandardProfileFileFormat.ODS:
            from .standard_profile_writer.ods_standard_profile_writer import (
                ODSStandardProfileWriter,
            )

            return ODSStandardProfileWriter
        elif file_format is StandardProfileFileFormat.arrow:
            from .standard_profile_writer.arrow_standard_profile_writer import (
                ArrowStandardProfileWriter,
            )

            return ArrowStandardProfileWriter
        elif file_format is StandardProfileFileFormat.parquet:
            from .standard_profile_writer.parquet_standard_profile_writer import (
                ParquetStandardProfileWriter,
            )

            return ParquetStandardProfileWriter
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported "
                f"standard profile writer format."
            )

    @classmethod
    def table_reader(cls, file_format: TableReaderFileFormat) -> Type[TableReader]:
        """Return a table reader of the correct type."""
        if file_format is TableReaderFileFormat.TSV:
            from .table_reader.tsv_table_reader import TSVTableReader

            return TSVTableReader
        elif file_format is TableReaderFileFormat.CSV:
            from .table_reader.csv_table_reader import CSVTableReader

            return CSVTableReader
        elif file_format is TableReaderFileFormat.XLSX:
            from .table_reader.xlsx_table_reader import XLSXTableReader

            return XLSXTableReader
        elif file_format is TableReaderFileFormat.ODS:
            from .table_reader.ods_table_reader import ODSTableReader

            return ODSTableReader
        elif file_format is TableReaderFileFormat.arrow:
            from .table_reader.arrow_table_reader import ArrowTableReader

            return ArrowTableReader
        elif file_format is TableReaderFileFormat.parquet:
            from .table_reader.parquet_table_reader import ParquetTableReader

            return ParquetTableReader
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported table "
                f"reader format."
            )

    @classmethod
    def tidy_observation_table_writer(
        cls, file_format: TidyObservationTableFileFormat
    ) -> Type[TidyObservationTableWriter]:
        """Return a tidy table writer of the correct type."""
        if file_format is TidyObservationTableFileFormat.TSV:
            from .tidy_observation_table_writer.tsv_table_writer import (
                TSVTidyObservationTableWriter,
            )

            return TSVTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.CSV:
            from .tidy_observation_table_writer.csv_table_writer import (
                CSVTidyObservationTableWriter,
            )

            return CSVTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.XLSX:
            from .tidy_observation_table_writer.xlsx_table_writer import (
                XLSXTidyObservationTableWriter,
            )

            return XLSXTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.ODS:
            from .tidy_observation_table_writer.ods_table_writer import (
                ODSTidyObservationTableWriter,
            )

            return ODSTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.arrow:
            from .tidy_observation_table_writer.arrow_table_writer import (
                ArrowTidyObservationTableWriter,
            )

            return ArrowTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.parquet:
            from .tidy_observation_table_writer.parquet_table_writer import (
                ParquetTidyObservationTableWriter,
            )

            return ParquetTidyObservationTableWriter
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported tidy "
                f"observation table writer format."
            )

    @classmethod
    def wide_observation_table_writer(
        cls, file_format: WideObservationTableFileFormat
    ) -> Type[WideObservationTableWriter]:
        """Return a writer for wide observation tables in the specified format."""
        if file_format is WideObservationTableFileFormat.TSV:
            from .wide_observation_table_writer.tsv_wide_observation_table_writer import (
                TSVWideObservationTableWriter,
            )

            return TSVWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.CSV:
            from .wide_observation_table_writer.csv_wide_observation_table_writer import (
                CSVWideObservationTableWriter,
            )

            return CSVWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.XLSX:
            from .wide_observation_table_writer.xlsx_wide_observation_table_writer import (
                XLSXWideObservationTableWriter,
            )

            return XLSXWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.ODS:
            from .wide_observation_table_writer.ods_wide_observation_table_writer import (
                ODSWideObservationTableWriter,
            )

            return ODSWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.arrow:
            from .wide_observation_table_writer.arrow_wide_observation_table_writer import (
                ArrowWideObservationTableWriter,
            )

            return ArrowWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.parquet:
            from .wide_observation_table_writer.parquet_wide_observation_table_writer import (
                ParquetWideObservationTableWriter,
            )

            return ParquetWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.BIOM:
            from .wide_observation_table_writer.biom_wide_observation_table_writer import (
                BIOMWideObservationTableWriter,
            )

            return BIOMWideObservationTableWriter
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported "
                f"observation matrix writer format."
            )
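Taken together, the registry reduces the typical read-and-standardise flow to two lookups. The following sketch is illustrative only: it assumes that ApplicationServiceRegistry and SupportedProfiler can be imported from taxpasta.infrastructure.application, and it uses a hypothetical input path bracken.tsv.

from taxpasta.infrastructure.application import (
    ApplicationServiceRegistry,
    SupportedProfiler,
)

# Look up the reader and standardisation service classes for Bracken.
reader = ApplicationServiceRegistry.profile_reader(SupportedProfiler.bracken)
service = ApplicationServiceRegistry.profile_standardisation_service(
    SupportedProfiler.bracken
)

# Parse the raw report, then transform it into the standard profile format.
profile = reader.read("bracken.tsv")  # hypothetical input path
standard = service.transform(profile)

Because every branch imports its implementation lazily, only the modules for the requested profiler are actually loaded.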
Functions
profile_reader(profiler: SupportedProfiler) -> Type[ProfileReader] classmethod

Return a profile reader of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
    """Return a profile reader of the correct type."""
    if profiler is SupportedProfiler.bracken:
        from .bracken import BrackenProfileReader

        return BrackenProfileReader
    elif profiler is SupportedProfiler.centrifuge:
        from .centrifuge import CentrifugeProfileReader

        return CentrifugeProfileReader
    elif profiler is SupportedProfiler.diamond:
        from .diamond import DiamondProfileReader

        return DiamondProfileReader
    elif profiler is SupportedProfiler.ganon:
        from .ganon import GanonProfileReader

        return GanonProfileReader
    elif profiler is SupportedProfiler.kaiju:
        from .kaiju import KaijuProfileReader

        return KaijuProfileReader
    elif profiler is SupportedProfiler.kmcp:
        from .kmcp import KMCPProfileReader

        return KMCPProfileReader
    elif profiler is SupportedProfiler.kraken2:
        from .kraken2 import Kraken2ProfileReader

        return Kraken2ProfileReader
    elif profiler is SupportedProfiler.krakenuniq:
        from .krakenuniq import KrakenUniqProfileReader

        return KrakenUniqProfileReader
    elif profiler is SupportedProfiler.megan6:
        from .megan6 import Megan6ProfileReader

        return Megan6ProfileReader
    elif profiler is SupportedProfiler.metaphlan:
        from .metaphlan import MetaphlanProfileReader

        return MetaphlanProfileReader
    elif profiler is SupportedProfiler.motus:
        from .motus import MotusProfileReader

        return MotusProfileReader
    else:
        raise ValueError(f"Unexpected profiler: {profiler}.")
profile_standardisation_service(profiler: SupportedProfiler) -> Type[ProfileStandardisationService] classmethod

Return a profile standardisation service of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def profile_standardisation_service(
    cls, profiler: SupportedProfiler
) -> Type[ProfileStandardisationService]:
    """Return a profile standardisation service of the correct type."""
    if profiler is SupportedProfiler.bracken:
        from .bracken import BrackenProfileStandardisationService

        return BrackenProfileStandardisationService
    elif profiler is SupportedProfiler.centrifuge:
        from .centrifuge import CentrifugeProfileStandardisationService

        return CentrifugeProfileStandardisationService
    elif profiler is SupportedProfiler.diamond:
        from .diamond import DiamondProfileStandardisationService

        return DiamondProfileStandardisationService
    elif profiler is SupportedProfiler.kaiju:
        from .kaiju import KaijuProfileStandardisationService

        return KaijuProfileStandardisationService
    elif profiler is SupportedProfiler.kraken2:
        from .kraken2 import Kraken2ProfileStandardisationService

        return Kraken2ProfileStandardisationService
    elif profiler is SupportedProfiler.krakenuniq:
        from .krakenuniq import KrakenUniqProfileStandardisationService

        return KrakenUniqProfileStandardisationService
    elif profiler is SupportedProfiler.megan6:
        from .megan6 import Megan6ProfileStandardisationService

        return Megan6ProfileStandardisationService
    elif profiler is SupportedProfiler.motus:
        from .motus import MotusProfileStandardisationService

        return MotusProfileStandardisationService
    elif profiler is SupportedProfiler.metaphlan:
        from .metaphlan import MetaphlanProfileStandardisationService

        return MetaphlanProfileStandardisationService
    elif profiler is SupportedProfiler.ganon:
        from .ganon import GanonProfileStandardisationService

        return GanonProfileStandardisationService
    elif profiler is SupportedProfiler.kmcp:
        from .kmcp import KMCPProfileStandardisationService

        return KMCPProfileStandardisationService
    else:
        raise ValueError(f"Unexpected profiler: {profiler}.")
standard_profile_writer(file_format: StandardProfileFileFormat) -> Type[StandardProfileWriter] classmethod

Return a standard profile writer of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def standard_profile_writer(
    cls, file_format: StandardProfileFileFormat
) -> Type[StandardProfileWriter]:
    """Return a standard profile writer of the correct type."""
    if file_format is StandardProfileFileFormat.TSV:
        from .standard_profile_writer.tsv_standard_profile_writer import (
            TSVStandardProfileWriter,
        )

        return TSVStandardProfileWriter
    elif file_format is StandardProfileFileFormat.CSV:
        from .standard_profile_writer.csv_standard_profile_writer import (
            CSVStandardProfileWriter,
        )

        return CSVStandardProfileWriter
    elif file_format is StandardProfileFileFormat.XLSX:
        from .standard_profile_writer.xlsx_standard_profile_writer import (
            XLSXStandardProfileWriter,
        )

        return XLSXStandardProfileWriter
    elif file_format is StandardProfileFileFormat.ODS:
        from .standard_profile_writer.ods_standard_profile_writer import (
            ODSStandardProfileWriter,
        )

        return ODSStandardProfileWriter
    elif file_format is StandardProfileFileFormat.arrow:
        from .standard_profile_writer.arrow_standard_profile_writer import (
            ArrowStandardProfileWriter,
        )

        return ArrowStandardProfileWriter
    elif file_format is StandardProfileFileFormat.parquet:
        from .standard_profile_writer.parquet_standard_profile_writer import (
            ParquetStandardProfileWriter,
        )

        return ParquetStandardProfileWriter
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported "
            f"standard profile writer format."
        )
table_reader(file_format: TableReaderFileFormat) -> Type[TableReader] classmethod

Return a table reader of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def table_reader(cls, file_format: TableReaderFileFormat) -> Type[TableReader]:
    """Return a table reader of the correct type."""
    if file_format is TableReaderFileFormat.TSV:
        from .table_reader.tsv_table_reader import TSVTableReader

        return TSVTableReader
    elif file_format is TableReaderFileFormat.CSV:
        from .table_reader.csv_table_reader import CSVTableReader

        return CSVTableReader
    elif file_format is TableReaderFileFormat.XLSX:
        from .table_reader.xlsx_table_reader import XLSXTableReader

        return XLSXTableReader
    elif file_format is TableReaderFileFormat.ODS:
        from .table_reader.ods_table_reader import ODSTableReader

        return ODSTableReader
    elif file_format is TableReaderFileFormat.arrow:
        from .table_reader.arrow_table_reader import ArrowTableReader

        return ArrowTableReader
    elif file_format is TableReaderFileFormat.parquet:
        from .table_reader.parquet_table_reader import ParquetTableReader

        return ParquetTableReader
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported table "
            f"reader format."
        )
tidy_observation_table_writer(file_format: TidyObservationTableFileFormat) -> Type[TidyObservationTableWriter] classmethod

Return a tidy table writer of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def tidy_observation_table_writer(
    cls, file_format: TidyObservationTableFileFormat
) -> Type[TidyObservationTableWriter]:
    """Return a tidy table writer of the correct type."""
    if file_format is TidyObservationTableFileFormat.TSV:
        from .tidy_observation_table_writer.tsv_table_writer import (
            TSVTidyObservationTableWriter,
        )

        return TSVTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.CSV:
        from .tidy_observation_table_writer.csv_table_writer import (
            CSVTidyObservationTableWriter,
        )

        return CSVTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.XLSX:
        from .tidy_observation_table_writer.xlsx_table_writer import (
            XLSXTidyObservationTableWriter,
        )

        return XLSXTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.ODS:
        from .tidy_observation_table_writer.ods_table_writer import (
            ODSTidyObservationTableWriter,
        )

        return ODSTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.arrow:
        from .tidy_observation_table_writer.arrow_table_writer import (
            ArrowTidyObservationTableWriter,
        )

        return ArrowTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.parquet:
        from .tidy_observation_table_writer.parquet_table_writer import (
            ParquetTidyObservationTableWriter,
        )

        return ParquetTidyObservationTableWriter
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported tidy "
            f"observation table writer format."
        )
wide_observation_table_writer(file_format: WideObservationTableFileFormat) -> Type[WideObservationTableWriter] classmethod

Return a writer for wide observation tables in the specified format.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def wide_observation_table_writer(
    cls, file_format: WideObservationTableFileFormat
) -> Type[WideObservationTableWriter]:
    """Return a writer for wide observation tables in the specified format."""
    if file_format is WideObservationTableFileFormat.TSV:
        from .wide_observation_table_writer.tsv_wide_observation_table_writer import (
            TSVWideObservationTableWriter,
        )

        return TSVWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.CSV:
        from .wide_observation_table_writer.csv_wide_observation_table_writer import (
            CSVWideObservationTableWriter,
        )

        return CSVWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.XLSX:
        from .wide_observation_table_writer.xlsx_wide_observation_table_writer import (
            XLSXWideObservationTableWriter,
        )

        return XLSXWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.ODS:
        from .wide_observation_table_writer.ods_wide_observation_table_writer import (
            ODSWideObservationTableWriter,
        )

        return ODSWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.arrow:
        from .wide_observation_table_writer.arrow_wide_observation_table_writer import (
            ArrowWideObservationTableWriter,
        )

        return ArrowWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.parquet:
        from .wide_observation_table_writer.parquet_wide_observation_table_writer import (
            ParquetWideObservationTableWriter,
        )

        return ParquetWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.BIOM:
        from .wide_observation_table_writer.biom_wide_observation_table_writer import (
            BIOMWideObservationTableWriter,
        )

        return BIOMWideObservationTableWriter
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported "
            f"observation matrix writer format."
        )
bracken
Classes
Modules
bracken_profile

Provide a description of the Bracken profile format.

Attributes
BRACKEN_FRACTION_TOLERANCE = 0.01 module-attribute
BRACKEN_FRACTION_TOTAL = 1.0 module-attribute
Classes
BrackenProfile

Bases: BaseDataFrameModel

Define the expected Bracken profile format.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
class BrackenProfile(BaseDataFrameModel):
    """Define the expected Bracken profile format."""

    name: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    taxonomy_lvl: Series[str] = pa.Field()
    kraken_assigned_reads: Series[int] = pa.Field(ge=0)
    added_reads: Series[int] = pa.Field(ge=0)
    new_est_reads: Series[int] = pa.Field(ge=0)
    fraction_total_reads: Series[float] = pa.Field(ge=0.0, le=1.0)

    @pa.check("fraction_total_reads", name="compositionality", raise_warning=True)
    def check_compositionality(cls, fraction_total_reads: Series[float]) -> bool:
        """Check that the fractions of reads add up to one."""
        # Bracken reports fractions with five decimals but rounding errors accumulate.
        return fraction_total_reads.empty or bool(
            np.isclose(
                fraction_total_reads.sum(),
                BRACKEN_FRACTION_TOTAL,
                atol=BRACKEN_FRACTION_TOLERANCE,
            )
        )

    @pa.dataframe_check
    def check_added_reads_consistency(cls, profile: DataFrame) -> Series[bool]:
        """Check that Bracken added reads are consistent."""
        return (
            profile[cls.kraken_assigned_reads] + profile[cls.added_reads]
            == profile[cls.new_est_reads]
        )
Attributes
added_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
fraction_total_reads: Series[float] = pa.Field(ge=0.0, le=1.0) class-attribute instance-attribute
kraken_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
new_est_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_lvl: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_added_reads_consistency(profile: DataFrame) -> Series[bool]

Check that Bracken added reads are consistent.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
@pa.dataframe_check
def check_added_reads_consistency(cls, profile: DataFrame) -> Series[bool]:
    """Check that Bracken added reads are consistent."""
    return (
        profile[cls.kraken_assigned_reads] + profile[cls.added_reads]
        == profile[cls.new_est_reads]
    )
check_compositionality(fraction_total_reads: Series[float]) -> bool

Check that the fractions of reads add up to one.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
@pa.check("fraction_total_reads", name="compositionality", raise_warning=True)
def check_compositionality(cls, fraction_total_reads: Series[float]) -> bool:
    """Check that the fractions of reads add up to one."""
    # Bracken reports fractions with five decimals but rounding errors accumulate.
    return fraction_total_reads.empty or bool(
        np.isclose(
            fraction_total_reads.sum(),
            BRACKEN_FRACTION_TOTAL,
            atol=BRACKEN_FRACTION_TOLERANCE,
        )
    )
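In essence, the compositionality check is a single np.isclose call with an absolute tolerance. A minimal, self-contained sketch with toy numbers (not real Bracken output):

import numpy as np
import pandas as pd

BRACKEN_FRACTION_TOTAL = 1.0
BRACKEN_FRACTION_TOLERANCE = 0.01

# Accumulated rounding leaves the sum slightly short of 1.0.
fractions = pd.Series([0.49999, 0.30001, 0.19998])

# The sum of 0.99998 lies within the 0.01 absolute tolerance, so this passes.
assert np.isclose(
    fractions.sum(), BRACKEN_FRACTION_TOTAL, atol=BRACKEN_FRACTION_TOLERANCE
)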
bracken_profile_reader

Provide a reader for Bracken profiles.

Attributes

Classes
BrackenProfileReader

Bases: ProfileReader

Define a reader for Bracken profiles.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_reader.py
class BrackenProfileReader(ProfileReader):
    """Define a reader for Bracken profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[BrackenProfile]:
        """
        Read a Bracken taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by Bracken.

        Returns:
            A data frame representation of the Bracken profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, BrackenProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[BrackenProfile] classmethod

Read a Bracken taxonomic profile from the given source.

Parameters:

profile (BufferOrFilepath, required): A source that contains a tab-separated taxonomic profile generated by Bracken.

Returns:

DataFrame[BrackenProfile]: A data frame representation of the Bracken profile.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[BrackenProfile]:
    """
    Read a Bracken taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by Bracken.

    Returns:
        A data frame representation of the Bracken profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, BrackenProfile)
    return result
Functions
bracken_profile_standardisation_service

Provide a standardisation service for Bracken profiles.

Classes
BrackenProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for Bracken profiles.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_standardisation_service.py
class BrackenProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for Bracken profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[BrackenProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given Bracken profile.

        Args:
            profile: A taxonomic profile generated by Bracken.

        Returns:
            A standardized profile.

        Raises:
            pandera.errors.SchemaErrors: If the given profile does not conform with the
                `BrackenProfile` or the transformed output does not conform with the
                `StandardProfile`.  # noqa: DAR402

        """
        return (
            profile[[BrackenProfile.taxonomy_id, BrackenProfile.new_est_reads]]
            .copy()
            .rename(
                columns={
                    BrackenProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    BrackenProfile.new_est_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[BrackenProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given Bracken profile.

Parameters:

profile (DataFrame[BrackenProfile], required): A taxonomic profile generated by Bracken.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Raises:

pandera.errors.SchemaErrors: If the given profile does not conform with the BrackenProfile or the transformed output does not conform with the StandardProfile.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[BrackenProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given Bracken profile.

    Args:
        profile: A taxonomic profile generated by Bracken.

    Returns:
        A standardized profile.

    Raises:
        pandera.errors.SchemaErrors: If the given profile does not conform with the
            `BrackenProfile` or the transformed output does not conform with the
            `StandardProfile`.  # noqa: DAR402

    """
    return (
        profile[[BrackenProfile.taxonomy_id, BrackenProfile.new_est_reads]]
        .copy()
        .rename(
            columns={
                BrackenProfile.taxonomy_id: StandardProfile.taxonomy_id,
                BrackenProfile.new_est_reads: StandardProfile.count,
            }
        )
    )
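On a toy frame, the transformation amounts to selecting the two relevant columns and renaming them to the StandardProfile schema; the schema classes above supply the column names as string constants. The example below uses made-up numbers:

import pandas as pd

# Only the two columns the transformation touches; a real Bracken profile
# carries all seven BrackenProfile columns.
profile = pd.DataFrame(
    {
        "taxonomy_id": [9606, 562],
        "new_est_reads": [1500, 300],
    }
)

standard = profile[["taxonomy_id", "new_est_reads"]].rename(
    columns={"new_est_reads": "count"}
)
# standard now has the columns taxonomy_id and count.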
centrifuge
Classes
Modules
centrifuge_profile

Provide a description of the centrifuge profile format.

Attributes
CENTRIFUGE_PERCENT_TOLERANCE = 1.0 module-attribute
CENTRIFUGE_PERCENT_TOTAL = 100.0 module-attribute
Classes
CentrifugeProfile

Bases: BaseDataFrameModel

Define the expected centrifuge profile format.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile.py
class CentrifugeProfile(BaseDataFrameModel):
    """Define the expected centrifuge profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    clade_assigned_reads: Series[int] = pa.Field(ge=0)
    direct_assigned_reads: Series[int] = pa.Field(ge=0)
    taxonomy_level: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    name: Series[str] = pa.Field()

    @pa.check("percent", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percent: Series[float]) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        return percent.empty or bool(
            np.isclose(
                percent[:2].sum(),
                CENTRIFUGE_PERCENT_TOTAL,
                atol=CENTRIFUGE_PERCENT_TOLERANCE,
            )
        )
Attributes
clade_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
direct_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_level: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(percent: Series[float]) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile.py
@pa.check("percent", name="compositionality", raise_warning=True)
def check_compositionality(cls, percent: Series[float]) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    return percent.empty or bool(
        np.isclose(
            percent[:2].sum(),
            CENTRIFUGE_PERCENT_TOTAL,
            atol=CENTRIFUGE_PERCENT_TOLERANCE,
        )
    )
centrifuge_profile_reader

Provide a reader for Centrifuge profiles.

Attributes

Classes
CentrifugeProfileReader

Bases: ProfileReader

Define a reader for centrifuge profiles.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_reader.py
class CentrifugeProfileReader(ProfileReader):
    """Define a reader for centrifuge profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile]:
        """
        Read a centrifuge taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by centrifuge.

        Returns:
            A data frame representation of the centrifuge profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, CentrifugeProfile)
        result.columns = [
            CentrifugeProfile.percent,
            CentrifugeProfile.clade_assigned_reads,
            CentrifugeProfile.direct_assigned_reads,
            CentrifugeProfile.taxonomy_level,
            CentrifugeProfile.taxonomy_id,
            CentrifugeProfile.name,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile] classmethod

Read a centrifuge taxonomic profile from the given source.

Parameters:

profile (BufferOrFilepath, required): A source that contains a tab-separated taxonomic profile generated by centrifuge.

Returns:

DataFrame[CentrifugeProfile]: A data frame representation of the centrifuge profile.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile]:
    """
    Read a centrifuge taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by centrifuge.

    Returns:
        A data frame representation of the centrifuge profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, CentrifugeProfile)
    result.columns = [
        CentrifugeProfile.percent,
        CentrifugeProfile.clade_assigned_reads,
        CentrifugeProfile.direct_assigned_reads,
        CentrifugeProfile.taxonomy_level,
        CentrifugeProfile.taxonomy_id,
        CentrifugeProfile.name,
    ]
    return result
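Because centrifuge reports carry no header row, the reader parses with header=None and assigns the schema's column names afterwards. A minimal sketch of the same pattern with a single toy row (the reader itself additionally validates the column count):

import io

import pandas as pd

# One toy row in the headerless, tab-separated centrifuge report layout:
# percent, clade reads, direct reads, taxonomy level, taxonomy ID, name.
raw = io.StringIO("100.0\t42\t42\tU\t0\tunclassified\n")

result = pd.read_table(raw, sep="\t", header=None, index_col=False)
result.columns = [
    "percent",
    "clade_assigned_reads",
    "direct_assigned_reads",
    "taxonomy_level",
    "taxonomy_id",
    "name",
]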
Functions
centrifuge_profile_standardisation_service

Provide a standardisation service for centrifuge profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
CentrifugeProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for centrifuge profiles.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_standardisation_service.py
class CentrifugeProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for centrifuge profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[CentrifugeProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given centrifuge profile.

        Args:
            profile: A taxonomic profile generated by centrifuge.

        Returns:
            A standardized profile.

        """
        return (
            profile[
                [CentrifugeProfile.taxonomy_id, CentrifugeProfile.direct_assigned_reads]
            ]
            .copy()
            .rename(
                columns={
                    CentrifugeProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    CentrifugeProfile.direct_assigned_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[CentrifugeProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given centrifuge profile.

Parameters:

profile (DataFrame[CentrifugeProfile], required): A taxonomic profile generated by centrifuge.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[CentrifugeProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given centrifuge profile.

    Args:
        profile: A taxonomic profile generated by centrifuge.

    Returns:
        A standardized profile.

    """
    return (
        profile[
            [CentrifugeProfile.taxonomy_id, CentrifugeProfile.direct_assigned_reads]
        ]
        .copy()
        .rename(
            columns={
                CentrifugeProfile.taxonomy_id: StandardProfile.taxonomy_id,
                CentrifugeProfile.direct_assigned_reads: StandardProfile.count,
            }
        )
    )
diamond
Classes
Modules
diamond_profile

Provide a description of the diamond profile format.

Classes
DiamondProfile

Bases: BaseDataFrameModel

Define the expected diamond profile format.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile.py
class DiamondProfile(BaseDataFrameModel):
    """Define the expected diamond profile format."""

    query_id: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    e_value: Series[float] = pa.Field(ge=0.0, le=1.0)
Attributes
e_value: Series[float] = pa.Field(ge=0.0, le=1.0) class-attribute instance-attribute
query_id: Series[str] = pa.Field() class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
diamond_profile_reader

Provide a reader for diamond profiles.

Attributes

Classes
DiamondProfileReader

Bases: ProfileReader

Define a reader for Diamond profiles.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_reader.py
class DiamondProfileReader(ProfileReader):
    """Define a reader for Diamond profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[DiamondProfile]:
        """Read a diamond taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            dtype={2: float},
        )
        cls._check_num_columns(result, DiamondProfile)
        result.columns = [
            DiamondProfile.query_id,
            DiamondProfile.taxonomy_id,
            DiamondProfile.e_value,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[DiamondProfile] classmethod

Read a diamond taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[DiamondProfile]:
    """Read a diamond taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        dtype={2: float},
    )
    cls._check_num_columns(result, DiamondProfile)
    result.columns = [
        DiamondProfile.query_id,
        DiamondProfile.taxonomy_id,
        DiamondProfile.e_value,
    ]
    return result
Functions
diamond_profile_standardisation_service

Provide a standardisation service for diamond profiles.

Classes
DiamondProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for diamond profiles.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_standardisation_service.py
class DiamondProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for diamond profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[DiamondProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given diamond profile.

        Args:
            profile: A taxonomic profile generated by diamond.

        Returns:
            A standardized profile.

        """
        # Sum up occurrences of taxonomy identifiers to yield read count.
        return (
            profile[[DiamondProfile.taxonomy_id]]
            .groupby(DiamondProfile.taxonomy_id, sort=False)
            .size()
            .reset_index()
            .rename(
                columns={
                    DiamondProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    0: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[DiamondProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given diamond profile.

Parameters:

profile (DataFrame[DiamondProfile], required): A taxonomic profile generated by diamond.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[DiamondProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given diamond profile.

    Args:
        profile: A taxonomic profile generated by diamond.

    Returns:
        A standardized profile.

    """
    # Sum up occurrences of taxonomy identifiers to yield read count.
    return (
        profile[[DiamondProfile.taxonomy_id]]
        .groupby(DiamondProfile.taxonomy_id, sort=False)
        .size()
        .reset_index()
        .rename(
            columns={
                DiamondProfile.taxonomy_id: StandardProfile.taxonomy_id,
                0: StandardProfile.count,
            }
        )
    )
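The groupby step deserves a second look: every diamond row is a single read-to-taxon hit, so counting rows per taxonomy identifier yields the read count. A toy sketch of that aggregation:

import pandas as pd

# Each row represents one read assigned to a taxon.
hits = pd.DataFrame({"taxonomy_id": [562, 562, 9606, 562]})

counts = (
    hits.groupby("taxonomy_id", sort=False)
    .size()
    .reset_index()
    .rename(columns={0: "count"})
)
# counts: taxonomy_id 562 -> 3, taxonomy_id 9606 -> 1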
ganon
Classes
Modules
ganon_profile

Provide a description of the ganon profile format.

Attributes
GANON_PERCENT_TOLERANCE = 1.0 module-attribute
GANON_PERCENT_TOTAL = 100.0 module-attribute
Classes
GanonProfile

Bases: BaseDataFrameModel

Define the expected ganon profile format.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile.py
class GanonProfile(BaseDataFrameModel):
    """Define the expected ganon profile format."""

    rank: Series[str] = pa.Field()
    target: Series[str] = pa.Field()
    lineage: Series[str] = pa.Field()
    name: Series[str] = pa.Field()
    number_unique: Series[int] = pa.Field(ge=0)
    number_shared: Series[int] = pa.Field(ge=0)
    number_children: Series[int] = pa.Field(ge=0)
    number_cumulative: Series[int] = pa.Field(ge=0)
    percent_cumulative: Series[float] = pa.Field(ge=0.0, le=100.0)

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        # Ganon reports percentage to 5 decimal places, but rounding errors do add up.
        return profile.empty or bool(
            np.isclose(
                profile.loc[
                    profile[cls.rank].isin(["unclassified", "root"]),
                    cls.percent_cumulative,
                ].sum(),
                GANON_PERCENT_TOTAL,
                atol=GANON_PERCENT_TOLERANCE,
            )
        )
Attributes
lineage: Series[str] = pa.Field() class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
number_children: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_cumulative: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_shared: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_unique: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
percent_cumulative: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
rank: Series[str] = pa.Field() class-attribute instance-attribute
target: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    # Ganon reports percentage to 5 decimal places, but rounding errors do add up.
    return profile.empty or bool(
        np.isclose(
            profile.loc[
                profile[cls.rank].isin(["unclassified", "root"]),
                cls.percent_cumulative,
            ].sum(),
            GANON_PERCENT_TOTAL,
            atol=GANON_PERCENT_TOLERANCE,
        )
    )
ganon_profile_reader

Provide a reader for ganon profiles.

Attributes

Classes
GanonProfileReader

Bases: ProfileReader

Define a reader for ganon profiles.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_reader.py
class GanonProfileReader(ProfileReader):
    """Define a reader for ganon profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[GanonProfile]:
        """
        Read a ganon taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by ganon.

        Returns:
            A data frame representation of the ganon profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, GanonProfile)
        result.columns = [
            GanonProfile.rank,
            GanonProfile.target,
            GanonProfile.lineage,
            GanonProfile.name,
            GanonProfile.number_unique,
            GanonProfile.number_shared,
            GanonProfile.number_children,
            GanonProfile.number_cumulative,
            GanonProfile.percent_cumulative,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[GanonProfile] classmethod

Read a ganon taxonomic profile from the given source.

Parameters:

profile (BufferOrFilepath, required): A source that contains a tab-separated taxonomic profile generated by ganon.

Returns:

DataFrame[GanonProfile]: A data frame representation of the ganon profile.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[GanonProfile]:
    """
    Read a ganon taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by ganon.

    Returns:
        A data frame representation of the ganon profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, GanonProfile)
    result.columns = [
        GanonProfile.rank,
        GanonProfile.target,
        GanonProfile.lineage,
        GanonProfile.name,
        GanonProfile.number_unique,
        GanonProfile.number_shared,
        GanonProfile.number_children,
        GanonProfile.number_cumulative,
        GanonProfile.percent_cumulative,
    ]
    return result
Functions
ganon_profile_standardisation_service

Provide a standardisation service for ganon profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
GanonProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for ganon profiles.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_standardisation_service.py
class GanonProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for ganon profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given ganon profile.

        Args:
            profile: A taxonomic profile generated by ganon.

        Returns:
            A standardized profile.

        """
        # Select unclassified entries, rename columns, assign taxonomy ID zero, and
        #  sum up counts.
        unclassified = (
            profile.loc[
                profile[GanonProfile.target] == "-",
                [GanonProfile.target, GanonProfile.number_unique],
            ]
            .copy()
            .rename(
                columns={
                    GanonProfile.target: StandardProfile.taxonomy_id,
                    GanonProfile.number_unique: StandardProfile.count,
                }
            )
            .assign(**{StandardProfile.taxonomy_id: 0})
            .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
            .sum()
        )
        # Select classified entries, rename columns, and convert taxonomy ID to integer.
        classified = (
            profile.loc[
                profile[GanonProfile.target] != "-",
                [GanonProfile.target, GanonProfile.number_unique],
            ]
            .copy()
            .rename(
                columns={
                    GanonProfile.target: StandardProfile.taxonomy_id,
                    GanonProfile.number_unique: StandardProfile.count,
                }
            )
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ].astype(int)
                }
            )
        )
        return pd.concat([unclassified, classified], ignore_index=True)
Functions
transform(profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given ganon profile.

Parameters:

profile (DataFrame[GanonProfile], required): A taxonomic profile generated by ganon.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given ganon profile.

    Args:
        profile: A taxonomic profile generated by ganon.

    Returns:
        A standardized profile.

    """
    # Select unclassified entries, rename columns, assign taxonomy ID zero, and
    #  sum up counts.
    unclassified = (
        profile.loc[
            profile[GanonProfile.target] == "-",
            [GanonProfile.target, GanonProfile.number_unique],
        ]
        .copy()
        .rename(
            columns={
                GanonProfile.target: StandardProfile.taxonomy_id,
                GanonProfile.number_unique: StandardProfile.count,
            }
        )
        .assign(**{StandardProfile.taxonomy_id: 0})
        .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
        .sum()
    )
    # Select classified entries, rename columns, and convert taxonomy ID to integer.
    classified = (
        profile.loc[
            profile[GanonProfile.target] != "-",
            [GanonProfile.target, GanonProfile.number_unique],
        ]
        .copy()
        .rename(
            columns={
                GanonProfile.target: StandardProfile.taxonomy_id,
                GanonProfile.number_unique: StandardProfile.count,
            }
        )
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ].astype(int)
            }
        )
    )
    return pd.concat([unclassified, classified], ignore_index=True)
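The two-way split is the crux: ganon marks unclassified reads with a "-" target, which cannot be cast to an integer taxonomy identifier, so those rows are collapsed onto taxonomy ID zero before being recombined with the classified rows. A toy sketch of the same pattern:

import pandas as pd

# "-" marks unclassified reads; other targets are taxonomy identifiers.
profile = pd.DataFrame(
    {
        "target": ["-", "562", "9606"],
        "number_unique": [10, 3, 7],
    }
)

mask = profile["target"] == "-"
unclassified = (
    profile.loc[mask, ["target", "number_unique"]]
    .rename(columns={"target": "taxonomy_id", "number_unique": "count"})
    .assign(taxonomy_id=0)
    .groupby("taxonomy_id", as_index=False, sort=False)
    .sum()
)
classified = (
    profile.loc[~mask, ["target", "number_unique"]]
    .rename(columns={"target": "taxonomy_id", "number_unique": "count"})
    .assign(taxonomy_id=lambda df: df["taxonomy_id"].astype(int))
)
result = pd.concat([unclassified, classified], ignore_index=True)
# result rows: (0, 10), (562, 3), (9606, 7)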
kaiju
Classes
Modules
kaiju_profile

Provide a description of the kaiju profile format.

Attributes
KAIJU_PERCENT_TOLERANCE = 1.0 module-attribute
KAIJU_PERCENT_TOTAL = 100.0 module-attribute
Classes
KaijuProfile

Bases: BaseDataFrameModel

Define the expected kaiju profile format.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
class KaijuProfile(BaseDataFrameModel):
    """Define the expected kaiju profile format."""

    file: Series[str] = pa.Field()
    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    reads: Series[int] = pa.Field(ge=0)
    taxon_id: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    taxon_name: Series[str] = pa.Field()

    @pa.check("percent", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percent: Series[float]) -> bool:
        """Check that the percentages add up to a hundred."""
        # Kaiju reports percentages to six decimal places.
        return percent.empty or bool(
            np.isclose(percent.sum(), KAIJU_PERCENT_TOTAL, atol=KAIJU_PERCENT_TOLERANCE)
        )

    @pa.check("file", name="unique_filename")
    def check_unique_filename(cls, file_col: Series[str]) -> bool:
        """Check that Kaiju filename is unique."""
        return file_col.empty or file_col.nunique() == 1
Attributes
file: Series[str] = pa.Field() class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxon_id: Series[pd.Int64Dtype] = pa.Field(nullable=True) class-attribute instance-attribute
taxon_name: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(percent: Series[float]) -> bool

Check that the percentages add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
@pa.check("percent", name="compositionality", raise_warning=True)
def check_compositionality(cls, percent: Series[float]) -> bool:
    """Check that the percentages add up to a hundred."""
    # Kaiju reports percentages to six decimal places.
    return percent.empty or bool(
        np.isclose(percent.sum(), KAIJU_PERCENT_TOTAL, atol=KAIJU_PERCENT_TOLERANCE)
    )
check_unique_filename(file_col: Series[str]) -> bool

Check that Kaiju filename is unique.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
@pa.check("file", name="unique_filename")
def check_unique_filename(cls, file_col: Series[str]) -> bool:
    """Check that Kaiju filename is unique."""
    return file_col.empty or file_col.nunique() == 1
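The tolerance of 1.0 makes the compositionality check deliberately lenient: cumulative rounding drift of up to roughly one percentage point passes, while larger deviations fail the check (and, because of raise_warning=True, pandera emits a warning instead of rejecting the profile). A quick numpy sketch with invented sums:

import numpy as np

print(np.isclose(99.2, 100.0, atol=1.0))  # True: within one point of 100
print(np.isclose(97.5, 100.0, atol=1.0))  # False: drift too large, warning fires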
kaiju_profile_reader

Provide a reader for kaiju profiles.

Classes
KaijuProfileReader

Bases: ProfileReader

Define a reader for kaiju profiles.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_reader.py
class KaijuProfileReader(ProfileReader):
    """Define a reader for kaiju profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KaijuProfile]:
        """
        Read a kaiju taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by kaiju.

        Returns:
            A data frame representation of the kaiju profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=0,
            index_col=False,
            dtype={KaijuProfile.taxon_id: "Int64"},
        )
        cls._check_num_columns(result, KaijuProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KaijuProfile] classmethod

Read a kaiju taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by kaiju. Required.

Returns:

    DataFrame[KaijuProfile]: A data frame representation of the kaiju profile.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KaijuProfile]:
    """
    Read a kaiju taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by kaiju.

    Returns:
        A data frame representation of the kaiju profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=0,
        index_col=False,
        dtype={KaijuProfile.taxon_id: "Int64"},
    )
    cls._check_num_columns(result, KaijuProfile)
    return result
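A usage sketch for the reader, assuming taxpasta is installed and that the import path matches the source file shown above; the two report rows are invented values:

import io

from taxpasta.infrastructure.application.kaiju.kaiju_profile_reader import (
    KaijuProfileReader,
)

report = io.StringIO(
    "file\tpercent\treads\ttaxon_id\ttaxon_name\n"
    "sample.out\t90.0\t900\t562\tEscherichia coli\n"
    "sample.out\t10.0\t100\tNA\tunclassified\n"
)
frame = KaijuProfileReader.read(report)
print(frame.dtypes["taxon_id"])  # Int64: missing IDs remain nullable integers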
kaiju_profile_standardisation_service

Provide a standardisation service for kaiju profiles.

Classes
KaijuProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for kaiju profiles.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_standardisation_service.py
class KaijuProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for kaiju profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given kaiju profile.

        Args:
            profile: A taxonomic profile generated by kaiju.

        Returns:
            A standardized profile.

        """
        temp = (
            profile[[KaijuProfile.taxon_id, KaijuProfile.reads]]
            .copy()
            .rename(
                columns={
                    KaijuProfile.taxon_id: StandardProfile.taxonomy_id,
                    KaijuProfile.reads: StandardProfile.count,
                }
            )
        )
        result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
        result[StandardProfile.taxonomy_id] = result[
            StandardProfile.taxonomy_id
        ].astype(int)
        # Replace missing values (unclassified reads) with ID zero and sum reads.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given kaiju profile.

Parameters:

    profile (DataFrame[KaijuProfile]): A taxonomic profile generated by kaiju. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given kaiju profile.

    Args:
        profile: A taxonomic profile generated by kaiju.

    Returns:
        A standardized profile.

    """
    temp = (
        profile[[KaijuProfile.taxon_id, KaijuProfile.reads]]
        .copy()
        .rename(
            columns={
                KaijuProfile.taxon_id: StandardProfile.taxonomy_id,
                KaijuProfile.reads: StandardProfile.count,
            }
        )
    )
    result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
    result[StandardProfile.taxonomy_id] = result[
        StandardProfile.taxonomy_id
    ].astype(int)
    # Replace missing values (unclassified reads) with ID zero and sum reads.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
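The aggregation pattern above recurs nearly verbatim in the KMCP and mOTUs services further down this page: rows whose taxonomy ID is missing are collapsed into a single entry with ID zero. A minimal, self-contained pandas sketch on invented data:

import pandas as pd

temp = pd.DataFrame(
    {
        "taxonomy_id": pd.array([562, 1280, pd.NA], dtype="Int64"),
        "count": [900, 80, 20],
    }
)

# Keep rows with a known taxonomy ID and cast them to plain integers.
classified = temp.loc[temp["taxonomy_id"].notna()].copy()
classified["taxonomy_id"] = classified["taxonomy_id"].astype(int)

# Sum the counts of all rows without an ID into the pseudo-taxon zero.
unclassified_total = temp.loc[temp["taxonomy_id"].isna(), "count"].sum()
result = pd.concat(
    [classified, pd.DataFrame({"taxonomy_id": [0], "count": [unclassified_total]})],
    ignore_index=True,
)
print(result)
#    taxonomy_id  count
# 0          562    900
# 1         1280     80
# 2            0     20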
kmcp
Classes
Modules
kmcp_profile

Provide a description of the KMCP profile format.

Attributes
KMCP_PERCENT_TOLERANCE = 1.0 module-attribute
KMCP_PERCENT_TOTAL = 100.0 module-attribute
Classes
KMCPProfile

Bases: BaseDataFrameModel

Define the expected KMCP profile format.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
class KMCPProfile(BaseDataFrameModel):
    """Define the expected KMCP profile format."""

    reference: Series[str] = pa.Field(alias="ref")
    percentage: Series[float] = pa.Field(ge=0.0, le=100.0)
    coverage: Series[float] = pa.Field(ge=0.0, nullable=True)
    score: Series[float] = pa.Field(ge=0.0, le=100.0)
    chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias="chunksFrac")
    chunks_relative_depth: Series[str] = pa.Field(alias="chunksRelDepth")
    chunks_relative_depth_std: Series[float] = pa.Field(
        ge=0.0, nullable=True, alias="chunksRelDepthStd"
    )
    reads: Series[int] = pa.Field(ge=0)
    unique_reads: Series[int] = pa.Field(ge=0, alias="ureads")
    high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias="hicureads")
    reference_size: Series[int] = pa.Field(ge=0, alias="refsize")
    reference_name: Series[str] = pa.Field(nullable=True, alias="refname")
    taxid: Series[int] = pa.Field(ge=0)
    rank: Series[str] = pa.Field(nullable=True)
    taxonomic_name: Series[str] = pa.Field(nullable=True, alias="taxname")
    taxonomic_path: Series[str] = pa.Field(nullable=True, alias="taxpath")
    taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias="taxpathsn")

    @pa.check("percentage", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percentage: Series[float]) -> bool:
        """Check that the percentages add up to a hundred."""
        # KMCP reports percentages to six decimal places.
        return percentage.empty or bool(
            np.isclose(
                percentage.sum(), KMCP_PERCENT_TOTAL, atol=KMCP_PERCENT_TOLERANCE
            )
        )
Attributes
chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias='chunksFrac') class-attribute instance-attribute
chunks_relative_depth: Series[str] = pa.Field(alias='chunksRelDepth') class-attribute instance-attribute
chunks_relative_depth_std: Series[float] = pa.Field(ge=0.0, nullable=True, alias='chunksRelDepthStd') class-attribute instance-attribute
coverage: Series[float] = pa.Field(ge=0.0, nullable=True) class-attribute instance-attribute
high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias='hicureads') class-attribute instance-attribute
percentage: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
rank: Series[str] = pa.Field(nullable=True) class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
reference: Series[str] = pa.Field(alias='ref') class-attribute instance-attribute
reference_name: Series[str] = pa.Field(nullable=True, alias='refname') class-attribute instance-attribute
reference_size: Series[int] = pa.Field(ge=0, alias='refsize') class-attribute instance-attribute
score: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxid: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomic_name: Series[str] = pa.Field(nullable=True, alias='taxname') class-attribute instance-attribute
taxonomic_path: Series[str] = pa.Field(nullable=True, alias='taxpath') class-attribute instance-attribute
taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias='taxpathsn') class-attribute instance-attribute
unique_reads: Series[int] = pa.Field(ge=0, alias='ureads') class-attribute instance-attribute
Functions
check_compositionality(percentage: Series[float]) -> bool

Check that the percentages add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
@pa.check("percentage", name="compositionality", raise_warning=True)
def check_compositionality(cls, percentage: Series[float]) -> bool:
    """Check that the percentages add up to a hundred."""
    # KMCP reports percentages to six decimal places.
    return percentage.empty or bool(
        np.isclose(
            percentage.sum(), KMCP_PERCENT_TOTAL, atol=KMCP_PERCENT_TOLERANCE
        )
    )
kmcp_profile_reader

Provide a reader for KMCP profiles.

Classes
KMCPProfileReader

Bases: ProfileReader

Define a reader for KMCP profiles.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
class KMCPProfileReader(ProfileReader):
    """Define a reader for KMCP profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
        """
        Read a KMCP taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by KMCP.

        Returns:
            A data frame representation of the KMCP profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=0,
            index_col=False,
            dtype={
                KMCPProfile.chunks_relative_depth: str,
            },
        )
        cls._check_num_columns(result, KMCPProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KMCPProfile] classmethod

Read a KMCP taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by KMCP. Required.

Returns:

    DataFrame[KMCPProfile]: A data frame representation of the KMCP profile.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
    """
    Read a KMCP taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by KMCP.

    Returns:
        A data frame representation of the KMCP profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=0,
        index_col=False,
        dtype={
            KMCPProfile.chunks_relative_depth: str,
        },
    )
    cls._check_num_columns(result, KMCPProfile)
    return result
kmcp_profile_standardisation_service

Provide a standardisation service for KMCP profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
KMCPProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for KMCP profiles.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
class KMCPProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for KMCP profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given KMCP profile.

        Args:
            profile: A taxonomic profile generated by KMCP.

        Returns:
            A standardized profile.

        """
        temp = (
            profile[[KMCPProfile.taxid, KMCPProfile.reads]]
            .copy()
            .rename(
                columns={
                    KMCPProfile.taxid: StandardProfile.taxonomy_id,
                    KMCPProfile.reads: StandardProfile.count,
                }
            )
        )
        result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
        result[StandardProfile.taxonomy_id] = result[
            StandardProfile.taxonomy_id
        ].astype(int)
        # Replace missing values (unclassified reads) with ID zero and sum reads.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given KMCP profile.

Parameters:

    profile (DataFrame[KMCPProfile]): A taxonomic profile generated by KMCP. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given KMCP profile.

    Args:
        profile: A taxonomic profile generated by KMCP.

    Returns:
        A standardized profile.

    """
    temp = (
        profile[[KMCPProfile.taxid, KMCPProfile.reads]]
        .copy()
        .rename(
            columns={
                KMCPProfile.taxid: StandardProfile.taxonomy_id,
                KMCPProfile.reads: StandardProfile.count,
            }
        )
    )
    result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
    result[StandardProfile.taxonomy_id] = result[
        StandardProfile.taxonomy_id
    ].astype(int)
    # Replace missing values (unclassified reads) with ID zero and sum reads.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
kraken2
Classes
Modules
kraken2_profile

Provide a description of the kraken2 profile format.

Attributes
KRAKEN2_PERCENT_TOLERANCE = 1.0 module-attribute
KRAKEN2_PERCENT_TOTAL = 100.0 module-attribute
Classes
Kraken2Profile

Bases: BaseDataFrameModel

Define the expected kraken2 profile format.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile.py
class Kraken2Profile(BaseDataFrameModel):
    """Define the expected kraken2 profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    clade_assigned_reads: Series[int] = pa.Field(ge=0)
    direct_assigned_reads: Series[int] = pa.Field(ge=0)
    num_minimizers: Optional[Series[int]] = pa.Field(ge=0)
    distinct_minimizers: Optional[Series[int]] = pa.Field(ge=0)
    taxonomy_lvl: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    name: Series[str] = pa.Field()

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        # Kraken2 reports percentages only to the second decimal, so we expect
        # some deviation.
        # If 100% of reads are assigned, unclassified reads are not reported at all.
        return profile.empty or bool(
            np.isclose(
                profile.loc[
                    profile[cls.taxonomy_lvl].isin(["U", "R"]), cls.percent
                ].sum(),
                KRAKEN2_PERCENT_TOTAL,
                atol=KRAKEN2_PERCENT_TOLERANCE,
            )
        )
Attributes
clade_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
direct_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
distinct_minimizers: Optional[Series[int]] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
num_minimizers: Optional[Series[int]] = pa.Field(ge=0) class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_lvl: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    # Kraken2 reports percentages only to the second decimal, so we expect
    # some deviation.
    # If 100% of reads are assigned, unclassified reads are not reported at all.
    return profile.empty or bool(
        np.isclose(
            profile.loc[
                profile[cls.taxonomy_lvl].isin(["U", "R"]), cls.percent
            ].sum(),
            KRAKEN2_PERCENT_TOTAL,
            atol=KRAKEN2_PERCENT_TOLERANCE,
        )
    )
kraken2_profile_reader

Provide a reader for kraken2 profiles.

Classes
Kraken2ProfileReader

Bases: ProfileReader

Define a reader for kraken2 profiles.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_reader.py
class Kraken2ProfileReader(ProfileReader):
    """Define a reader for kraken2 profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[Kraken2Profile]:
        """
        Read a kraken2 taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by kraken2.

        Returns:
            A data frame representation of the kraken2 profile.

        Raises:
            ValueError: In case the table does not contain exactly six or eight columns.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        if len(result.columns) == 6:
            result.columns = [
                Kraken2Profile.percent,
                Kraken2Profile.clade_assigned_reads,
                Kraken2Profile.direct_assigned_reads,
                Kraken2Profile.taxonomy_lvl,
                Kraken2Profile.taxonomy_id,
                Kraken2Profile.name,
            ]
        elif len(result.columns) == 8:
            result.columns = [
                Kraken2Profile.percent,
                Kraken2Profile.clade_assigned_reads,
                Kraken2Profile.direct_assigned_reads,
                Kraken2Profile.num_minimizers,
                Kraken2Profile.distinct_minimizers,
                Kraken2Profile.taxonomy_lvl,
                Kraken2Profile.taxonomy_id,
                Kraken2Profile.name,
            ]
        else:
            raise ValueError(
                f"Unexpected kraken2 report format. It has {len(result.columns)} "
                f"columns but only six or eight are expected."
            )
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[Kraken2Profile] classmethod

Read a kraken2 taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by kraken2. Required.

Returns:

    DataFrame[Kraken2Profile]: A data frame representation of the kraken2 profile.

Raises:

    ValueError: In case the table does not contain exactly six or eight columns.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[Kraken2Profile]:
    """
    Read a kraken2 taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by kraken2.

    Returns:
        A data frame representation of the kraken2 profile.

    Raises:
        ValueError: In case the table does not contain exactly six or eight columns.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    if len(result.columns) == 6:
        result.columns = [
            Kraken2Profile.percent,
            Kraken2Profile.clade_assigned_reads,
            Kraken2Profile.direct_assigned_reads,
            Kraken2Profile.taxonomy_lvl,
            Kraken2Profile.taxonomy_id,
            Kraken2Profile.name,
        ]
    elif len(result.columns) == 8:
        result.columns = [
            Kraken2Profile.percent,
            Kraken2Profile.clade_assigned_reads,
            Kraken2Profile.direct_assigned_reads,
            Kraken2Profile.num_minimizers,
            Kraken2Profile.distinct_minimizers,
            Kraken2Profile.taxonomy_lvl,
            Kraken2Profile.taxonomy_id,
            Kraken2Profile.name,
        ]
    else:
        raise ValueError(
            f"Unexpected kraken2 report format. It has {len(result.columns)} "
            f"columns but only six or eight are expected."
        )
    return result
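A usage sketch for the six-column layout, assuming taxpasta is installed and the import path matches the source file shown above. Kraken2 reports carry no header line, which is why the reader passes header=None and assigns the column names itself (rows invented):

import io

from taxpasta.infrastructure.application.kraken2.kraken2_profile_reader import (
    Kraken2ProfileReader,
)

report = io.StringIO(
    "90.00\t900\t900\tU\t0\tunclassified\n"
    "10.00\t100\t0\tR\t1\troot\n"
)
frame = Kraken2ProfileReader.read(report)
print(list(frame.columns))
# ['percent', 'clade_assigned_reads', 'direct_assigned_reads',
#  'taxonomy_lvl', 'taxonomy_id', 'name']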
kraken2_profile_standardisation_service

Provide a standardisation service for kraken2 profiles.

Classes
Kraken2ProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for kraken2 profiles.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_standardisation_service.py
class Kraken2ProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for kraken2 profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[Kraken2Profile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given kraken2 profile.

        Args:
            profile: A taxonomic profile generated by kraken2.

        Returns:
            A standardized profile.

        """
        return (
            profile[[Kraken2Profile.taxonomy_id, Kraken2Profile.direct_assigned_reads]]
            .copy()
            .rename(
                columns={
                    Kraken2Profile.taxonomy_id: StandardProfile.taxonomy_id,
                    Kraken2Profile.direct_assigned_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[Kraken2Profile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given kraken2 profile.

Parameters:

    profile (DataFrame[Kraken2Profile]): A taxonomic profile generated by kraken2. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[Kraken2Profile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given kraken2 profile.

    Args:
        profile: A taxonomic profile generated by kraken2.

    Returns:
        A standardized profile.

    """
    return (
        profile[[Kraken2Profile.taxonomy_id, Kraken2Profile.direct_assigned_reads]]
        .copy()
        .rename(
            columns={
                Kraken2Profile.taxonomy_id: StandardProfile.taxonomy_id,
                Kraken2Profile.direct_assigned_reads: StandardProfile.count,
            }
        )
    )
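The service keeps direct_assigned_reads rather than clade_assigned_reads because clade counts include all reads assigned to a taxon's descendants, so summing them across rows would count reads more than once. A small illustration with invented numbers:

# Toy three-row report: root, one genus, and one species under that genus.
rows = [
    # (clade_assigned_reads, direct_assigned_reads)
    (100, 10),  # root: 100 reads in the whole clade, 10 assigned directly
    (90, 30),   # genus: clade count includes the species' reads
    (60, 60),   # species: a leaf, so clade equals direct
]
print(sum(clade for clade, _ in rows))    # 250: descendants counted repeatedly
print(sum(direct for _, direct in rows))  # 100: matches the true read total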
krakenuniq
Classes
Modules
krakenuniq_profile

Provide a description of the KrakenUniq profile format.

Classes
KrakenUniqProfile

Bases: BaseDataFrameModel

Define the expected KrakenUniq profile format.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile.py
class KrakenUniqProfile(BaseDataFrameModel):
    """Define the expected KrakenUniq profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0, alias="%")
    reads: Series[int] = pa.Field(ge=0)
    tax_reads: Series[int] = pa.Field(ge=0, alias="taxReads")
    kmers: Series[int] = pa.Field(ge=0)
    duplicates: Series[float] = pa.Field(ge=0.0, alias="dup")
    coverage: Series[float] = pa.Field(ge=0.0, nullable=True, alias="cov")
    tax_id: Series[int] = pa.Field(alias="taxID", ge=0)
    rank: Series[str] = pa.Field()
    tax_name: Series[str] = pa.Field(alias="taxName")
Attributes
coverage: Series[float] = pa.Field(ge=0.0, nullable=True, alias='cov') class-attribute instance-attribute
duplicates: Series[float] = pa.Field(ge=0.0, alias='dup') class-attribute instance-attribute
kmers: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0, alias='%') class-attribute instance-attribute
rank: Series[str] = pa.Field() class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
tax_id: Series[int] = pa.Field(alias='taxID', ge=0) class-attribute instance-attribute
tax_name: Series[str] = pa.Field(alias='taxName') class-attribute instance-attribute
tax_reads: Series[int] = pa.Field(ge=0, alias='taxReads') class-attribute instance-attribute
krakenuniq_profile_reader

Provide a reader for KrakenUniq profiles.

Classes
KrakenUniqProfileReader

Bases: ProfileReader

Define a reader for KrakenUniq profiles.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_reader.py
class KrakenUniqProfileReader(ProfileReader):
    """Define a reader for KrakenUniq profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile]:
        """
        Read a KrakenUniq taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by KrakenUniq.

        Returns:
            A data frame representation of the KrakenUniq profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=2,
            header=0,
            index_col=False,
            skipinitialspace=True,
            dtype={
                KrakenUniqProfile.percent: float,
                KrakenUniqProfile.duplicates: float,
                KrakenUniqProfile.coverage: float,
            },
        )
        cls._check_num_columns(result, KrakenUniqProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile] classmethod

Read a KrakenUniq taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by KrakenUniq. Required.

Returns:

    DataFrame[KrakenUniqProfile]: A data frame representation of the KrakenUniq profile.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile]:
    """
    Read a KrakenUniq taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by KrakenUniq.

    Returns:
        A data frame representation of the KrakenUniq profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=2,
        header=0,
        index_col=False,
        skipinitialspace=True,
        dtype={
            KrakenUniqProfile.percent: float,
            KrakenUniqProfile.duplicates: float,
            KrakenUniqProfile.coverage: float,
        },
    )
    cls._check_num_columns(result, KrakenUniqProfile)
    return result
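The skiprows=2 implies that KrakenUniq reports begin with two comment lines before the header row; that detail is inferred from the reader itself, not stated elsewhere on this page. A self-contained pandas sketch of the equivalent call on an invented in-memory report:

import io

import pandas as pd

report = io.StringIO(
    "# KrakenUniq report\n"
    "# second comment line\n"
    "%\treads\ttaxReads\tkmers\tdup\tcov\ttaxID\trank\ttaxName\n"
    "100.0\t1000\t10\t5000\t1.1\t0.5\t1\tno rank\troot\n"
)
frame = pd.read_table(report, sep="\t", skiprows=2, header=0, index_col=False)
print(frame["taxID"].tolist())  # [1]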
krakenuniq_profile_standardisation_service

Provide a standardisation service for KrakenUniq profiles.

Classes
KrakenUniqProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for KrakenUniq profiles.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_standardisation_service.py
class KrakenUniqProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for krakenUniq profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[KrakenUniqProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given KrakenUniq profile.

        Args:
            profile: A taxonomic profile generated by KrakenUniq.

        Returns:
            A standardized profile.

        """
        return (
            profile[[KrakenUniqProfile.tax_id, KrakenUniqProfile.tax_reads]]
            .copy()
            .rename(
                columns={
                    KrakenUniqProfile.tax_id: StandardProfile.taxonomy_id,
                    KrakenUniqProfile.tax_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[KrakenUniqProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given KrakenUniq profile.

Parameters:

    profile (DataFrame[KrakenUniqProfile]): A taxonomic profile generated by KrakenUniq. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[KrakenUniqProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given KrakenUniq profile.

    Args:
        profile: A taxonomic profile generated by KrakenUniq.

    Returns:
        A standardized profile.

    """
    return (
        profile[[KrakenUniqProfile.tax_id, KrakenUniqProfile.tax_reads]]
        .copy()
        .rename(
            columns={
                KrakenUniqProfile.tax_id: StandardProfile.taxonomy_id,
                KrakenUniqProfile.tax_reads: StandardProfile.count,
            }
        )
    )
megan6
Classes
Modules
megan6_profile

Provide a description of the MEGAN6 rma2info profile format.

Classes
Megan6Profile

Bases: BaseDataFrameModel

Define the expected MEGAN6 rma2info profile format.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile.py
class Megan6Profile(BaseDataFrameModel):
    """Define the expected MEGAN6 rma2info profile format."""

    taxonomy_id: Series[int] = pa.Field(ge=0)
    count: Series[float] = pa.Field(ge=0.0)
Attributes
count: Series[float] = pa.Field(ge=0.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
megan6_profile_reader

Provide a reader for megan6 profiles.

Classes
Megan6ProfileReader

Bases: ProfileReader

Define a reader for MEGAN6 rma2info profiles.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_reader.py
class Megan6ProfileReader(ProfileReader):
    """Define a reader for MEGAN6 rma2info profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[Megan6Profile]:
        """Read a MEGAN6 rma2info taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            index_col=False,
            header=None,
        )
        cls._check_num_columns(result, Megan6Profile)
        result.columns = [Megan6Profile.taxonomy_id, Megan6Profile.count]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[Megan6Profile] classmethod

Read a MEGAN6 rma2info taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[Megan6Profile]:
    """Read a MEGAN6 rma2info taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        index_col=False,
        header=None,
    )
    cls._check_num_columns(result, Megan6Profile)
    result.columns = [Megan6Profile.taxonomy_id, Megan6Profile.count]
    return result
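A usage sketch, assuming taxpasta is installed and the import path matches the source file shown above. The reader expects a headerless two-column table, as produced by MEGAN6's rma2info export; the rows below are invented:

import io

from taxpasta.infrastructure.application.megan6.megan6_profile_reader import (
    Megan6ProfileReader,
)

report = io.StringIO("562\t900.0\n1280\t100.0\n")
frame = Megan6ProfileReader.read(report)
print(list(frame.columns))  # ['taxonomy_id', 'count']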
megan6_profile_standardisation_service

Provide a standardisation service for megan6 profiles.

Classes
Megan6ProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for megan6 profiles.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_standardisation_service.py
class Megan6ProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for megan6 profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given MEGAN6 rma2info profile.

        Args:
            profile: A taxonomic profile generated by MEGAN6 rma2info.

        Returns:
            A standardized profile.

        """
        return (
            profile[[Megan6Profile.taxonomy_id, Megan6Profile.count]]
            .copy()
            .rename(
                columns={
                    Megan6Profile.taxonomy_id: StandardProfile.taxonomy_id,
                    Megan6Profile.count: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given MEGAN6 rma2info profile.

Parameters:

    profile (DataFrame[Megan6Profile]): A taxonomic profile generated by MEGAN6 rma2info. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given MEGAN6 rma2info profile.

    Args:
        profile: A taxonomic profile generated by MEGAN6 rma2info.

    Returns:
        A standardized profile.

    """
    return (
        profile[[Megan6Profile.taxonomy_id, Megan6Profile.count]]
        .copy()
        .rename(
            columns={
                Megan6Profile.taxonomy_id: StandardProfile.taxonomy_id,
                Megan6Profile.count: StandardProfile.count,
            }
        )
    )
metaphlan
Classes
Modules
metaphlan_profile

Provide a description of the metaphlan profile format.

Attributes
METAPHLAN_PERCENT_TOLERANCE = 1.0 module-attribute
METAPHLAN_PERCENT_TOTAL = 100.0 module-attribute
Classes
MetaphlanProfile

Bases: BaseDataFrameModel

Define the expected metaphlan profile format.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile.py
class MetaphlanProfile(BaseDataFrameModel):
    """Define the expected metaphlan profile format."""

    clade_name: Series[str] = pa.Field()
    # MetaPhlAn provides the full lineage of tax IDs in this field.
    ncbi_tax_id: Series[str] = pa.Field(alias="NCBI_tax_id")
    relative_abundance: Series[float] = pa.Field(ge=0.0, le=100.0)
    additional_species: Optional[Series[str]] = pa.Field(nullable=True)

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percentages per rank add up to a hundred."""
        # Parse the rank from the given lineage.
        rank = profile[cls.clade_name].str.rsplit("|", n=1).str[-1].str[0]
        return profile.empty or bool(
            np.allclose(
                profile.groupby(rank, sort=False)[cls.relative_abundance].sum(),
                METAPHLAN_PERCENT_TOTAL,
                atol=METAPHLAN_PERCENT_TOLERANCE,
            )
        )
Attributes
additional_species: Optional[Series[str]] = pa.Field(nullable=True) class-attribute instance-attribute
clade_name: Series[str] = pa.Field() class-attribute instance-attribute
ncbi_tax_id: Series[str] = pa.Field(alias='NCBI_tax_id') class-attribute instance-attribute
relative_abundance: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percentages per rank add up to a hundred.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percentages per rank add up to a hundred."""
    # Parse the rank from the given lineage.
    rank = profile[cls.clade_name].str.rsplit("|", n=1).str[-1].str[0]
    return profile.empty or bool(
        np.allclose(
            profile.groupby(rank, sort=False)[cls.relative_abundance].sum(),
            METAPHLAN_PERCENT_TOTAL,
            atol=METAPHLAN_PERCENT_TOLERANCE,
        )
    )
metaphlan_profile_reader

Provide a reader for metaphlan profiles.

Classes
MetaphlanProfileReader

Bases: ProfileReader

Define a reader for Metaphlan profiles.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_reader.py
class MetaphlanProfileReader(ProfileReader):
    """Define a reader for Metaphlan profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile]:
        """Read a metaphlan taxonomic profile from a file."""
        num_header_lines = cls._detect_number_header_line(profile)
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=num_header_lines,
            header=None,
            index_col=False,
            dtype={1: str},
        )
        cls._check_num_columns(result, MetaphlanProfile)
        result.columns = [
            MetaphlanProfile.clade_name,
            MetaphlanProfile.ncbi_tax_id,
            MetaphlanProfile.relative_abundance,
            MetaphlanProfile.additional_species,
        ]
        return result

    @classmethod
    def _detect_number_header_line(cls, profile: BufferOrFilepath) -> int:
        """
        Detect the number of comment lines in the header of a MetaPhlAn profile.

        The number of lines varies at least between versions 3 & 4.

        """
        if isinstance(profile, BinaryIO):
            # We assume default file encoding here (UTF-8 in most environments).
            result = cls._detect_first_content_line(buffer=TextIOWrapper(profile))
            profile.seek(0)
            return result
        elif isinstance(profile, TextIO):
            result = cls._detect_first_content_line(buffer=profile)
            profile.seek(0)
            return result
        else:
            with Path(profile).open(mode="r") as handle:
                return cls._detect_first_content_line(buffer=handle)

    @classmethod
    def _detect_first_content_line(
        cls, buffer: TextIO, comment_marker: str = "#", max_lines: int = 10
    ) -> int:
        """Detect the first non-comment line in the given text buffer."""
        for num, line in enumerate(buffer):
            if not line.startswith(comment_marker):
                return num
            if num >= max_lines:
                raise ValueError(
                    "Unexpectedly large number of comment lines in MetaPhlAn "
                    "profile (>10)."
                )
        else:
            raise ValueError("Could not detect any content lines in MetaPhlAn profile.")
Functions
read(profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile] classmethod

Read a metaphlan taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile]:
    """Read a metaphlan taxonomic profile from a file."""
    num_header_lines = cls._detect_number_header_line(profile)
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=num_header_lines,
        header=None,
        index_col=False,
        dtype={1: str},
    )
    cls._check_num_columns(result, MetaphlanProfile)
    result.columns = [
        MetaphlanProfile.clade_name,
        MetaphlanProfile.ncbi_tax_id,
        MetaphlanProfile.relative_abundance,
        MetaphlanProfile.additional_species,
    ]
    return result
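Header detection boils down to counting leading '#' lines, which keeps the reader working across MetaPhlAn versions with headers of different lengths. A standalone sketch of the same idea; the helper and the sample lines are invented for illustration:

import io

def first_content_line(buffer, comment_marker="#"):
    # Index of the first line that does not start with the comment marker.
    for num, line in enumerate(buffer):
        if not line.startswith(comment_marker):
            return num
    raise ValueError("no content lines found")

profile = io.StringIO(
    "#mpa_vJan21\n"
    "#clade_name\tNCBI_tax_id\trelative_abundance\tadditional_species\n"
    "k__Bacteria\t2\t100.0\t\n"
)
print(first_content_line(profile))  # 2: both header lines start with '#'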
metaphlan_profile_standardisation_service

Provide a standardisation service for metaphlan profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
MetaphlanProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for metaphlan profiles.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_standardisation_service.py
class MetaphlanProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for metaphlan profiles."""

    # Metaphlan only reports up to six decimals so this number should be large enough.
    LARGE_INTEGER = 1_000_000

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[MetaphlanProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given metaphlan profile.

        Args:
            profile: A taxonomic profile generated by metaphlan.

        Returns:
            A standardized profile.

        """
        result = (
            profile[[MetaphlanProfile.ncbi_tax_id, MetaphlanProfile.relative_abundance]]
            .copy()
            .rename(
                columns={
                    MetaphlanProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                    MetaphlanProfile.relative_abundance: StandardProfile.count,
                }
            )
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ]
                    .str.rsplit("|", n=1)
                    .str[-1],
                    StandardProfile.count: lambda df: df[StandardProfile.count]
                    * cls.LARGE_INTEGER,
                }
            )
            .assign(
                **{
                    StandardProfile.count: lambda df: df[StandardProfile.count].astype(
                        int
                    )
                }
            )
        )
        result[StandardProfile.taxonomy_id] = pd.to_numeric(
            result[StandardProfile.taxonomy_id], errors="coerce"
        ).astype("Int64")
        unclassified_mask = result[StandardProfile.taxonomy_id].isna() | (
            result[StandardProfile.taxonomy_id] == -1
        )
        num = int(unclassified_mask.sum())
        if num > 0:
            logger.warning(
                "Combining %d entries with unclassified taxa in the profile.", num
            )
        return pd.concat(
            [
                result.loc[~unclassified_mask, :],
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            result.loc[unclassified_mask, StandardProfile.count].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Attributes
LARGE_INTEGER = 1000000 class-attribute instance-attribute
Functions
transform(profile: DataFrame[MetaphlanProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given metaphlan profile.

Parameters:

    profile (DataFrame[MetaphlanProfile]): A taxonomic profile generated by metaphlan. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[MetaphlanProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given metaphlan profile.

    Args:
        profile: A taxonomic profile generated by metaphlan.

    Returns:
        A standardized profile.

    """
    result = (
        profile[[MetaphlanProfile.ncbi_tax_id, MetaphlanProfile.relative_abundance]]
        .copy()
        .rename(
            columns={
                MetaphlanProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                MetaphlanProfile.relative_abundance: StandardProfile.count,
            }
        )
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ]
                .str.rsplit("|", n=1)
                .str[-1],
                StandardProfile.count: lambda df: df[StandardProfile.count]
                * cls.LARGE_INTEGER,
            }
        )
        .assign(
            **{
                StandardProfile.count: lambda df: df[StandardProfile.count].astype(
                    int
                )
            }
        )
    )
    result[StandardProfile.taxonomy_id] = pd.to_numeric(
        result[StandardProfile.taxonomy_id], errors="coerce"
    ).astype("Int64")
    unclassified_mask = result[StandardProfile.taxonomy_id].isna() | (
        result[StandardProfile.taxonomy_id] == -1
    )
    num = int(unclassified_mask.sum())
    if num > 0:
        logger.warning(
            "Combining %d entries with unclassified taxa in the profile.", num
        )
    return pd.concat(
        [
            result.loc[~unclassified_mask, :],
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        result.loc[unclassified_mask, StandardProfile.count].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
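The LARGE_INTEGER scaling converts relative abundances into integer pseudo-counts that fit the StandardProfile count column; since MetaPhlAn reports at most six decimal places, multiplying by 1,000,000 before the integer cast loses no reported precision. A quick check with an invented abundance:

relative_abundance = 12.5  # percent, as reported by MetaPhlAn
pseudo_count = int(relative_abundance * 1_000_000)
print(pseudo_count)  # 12500000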
motus
Classes
Modules
motus_profile

Provide a description of the mOTUs profile format.

Classes
MotusProfile

Bases: BaseDataFrameModel

Define the expected mOTUs profile format.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile.py
class MotusProfile(BaseDataFrameModel):
    """Define the expected mOTUs profile format."""

    consensus_taxonomy: Series[str] = pa.Field()
    ncbi_tax_id: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    read_count: Series[int] = pa.Field(ge=0)
Attributes
consensus_taxonomy: Series[str] = pa.Field() class-attribute instance-attribute
ncbi_tax_id: Series[pd.Int64Dtype] = pa.Field(nullable=True) class-attribute instance-attribute
read_count: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
motus_profile_reader

Provide a reader for motus profiles.

Classes
MotusProfileReader

Bases: ProfileReader

Define a reader for mOTUs profiles.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_reader.py
class MotusProfileReader(ProfileReader):
    """Define a reader for mOTUS profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[MotusProfile]:
        """Read a mOTUs taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=3,
            header=None,
            index_col=False,
            dtype={1: "Int64"},
        )
        cls._check_num_columns(result, MotusProfile)
        result.columns = [
            MotusProfile.consensus_taxonomy,
            MotusProfile.ncbi_tax_id,
            MotusProfile.read_count,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[MotusProfile] classmethod

Read a mOTUs taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[MotusProfile]:
    """Read a mOTUs taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=3,
        header=None,
        index_col=False,
        dtype={1: "Int64"},
    )
    cls._check_num_columns(result, MotusProfile)
    result.columns = [
        MotusProfile.consensus_taxonomy,
        MotusProfile.ncbi_tax_id,
        MotusProfile.read_count,
    ]
    return result
motus_profile_standardisation_service

Provide a standardisation service for mOTUs profiles.

Classes
MotusProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for mOTUs profiles.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_standardisation_service.py
class MotusProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for mOTUs profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given mOTUs profile.

        Args:
            profile: A taxonomic profile generated by mOTUs.

        Returns:
            A standardized profile.

        """
        temp = (
            profile.loc[
                # Ignore entries with zero read count.
                profile[MotusProfile.read_count] > 0,
                [MotusProfile.ncbi_tax_id, MotusProfile.read_count],
            ]
            .copy()
            .rename(
                columns={
                    MotusProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                    MotusProfile.read_count: StandardProfile.count,
                }
            )
        )
        # Split profile into entries with known and unknown tax ID.
        result = (
            temp.loc[temp[StandardProfile.taxonomy_id].notna(), :]
            .copy()
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ].astype(int)
                }
            )
            # FIXME (Moritz): Apparently, mOTUs profiles can contain duplicate tax IDs.
            #  Clarify with Sofia and Maxime. For now, sum up read counts.
            #  https://github.com/taxprofiler/taxpasta/issues/46
            .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
            .sum()
        )
        # Sum up all remaining read counts without tax ID to be 'unassigned'.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given mOTUs profile.

Parameters:

    profile (DataFrame[MotusProfile]): A taxonomic profile generated by mOTUs. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given mOTUs profile.

    Args:
        profile: A taxonomic profile generated by mOTUs.

    Returns:
        A standardized profile.

    """
    temp = (
        profile.loc[
            # Ignore entries with zero read count.
            profile[MotusProfile.read_count] > 0,
            [MotusProfile.ncbi_tax_id, MotusProfile.read_count],
        ]
        .copy()
        .rename(
            columns={
                MotusProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                MotusProfile.read_count: StandardProfile.count,
            }
        )
    )
    # Split profile into entries with known and unknown tax ID.
    result = (
        temp.loc[temp[StandardProfile.taxonomy_id].notna(), :]
        .copy()
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ].astype(int)
            }
        )
        # FIXME (Moritz): Apparently, mOTUs profiles can contain duplicate tax IDs.
        #  Clarify with Sofia and Maxime. For now, sum up read counts.
        #  https://github.com/taxprofiler/taxpasta/issues/46
        .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
        .sum()
    )
    # Sum up all remaining read counts without tax ID to be 'unassigned'.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
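For orientation, here is a minimal usage sketch with a toy profile. The column names and dtypes are assumptions based on the MotusProfile schema referenced above; the pandera check on transform will reject frames that deviate from that schema, and the data themselves are invented.

import pandas as pd

from taxpasta.infrastructure.application.motus import (
    MotusProfileStandardisationService,
)

# Two rows share the same tax ID and one row lacks a tax ID entirely.
toy = pd.DataFrame(
    {
        "consensus_taxonomy": ["Escherichia coli", "Escherichia coli", "unassigned"],
        "ncbi_tax_id": [562, 562, None],
        "read_count": [10, 5, 3],
    }
)
result = MotusProfileStandardisationService.transform(toy)
# Duplicate tax IDs are summed, and entries without a tax ID collapse into
# taxonomy ID 0 ('unassigned'):
#    taxonomy_id  count
#            562     15
#              0      3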
sample_sheet

Provide a description of samples and profile locations.

Classes
SampleSheet

Bases: DataFrameModel

Define a description of samples and profile locations.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
class SampleSheet(pa.DataFrameModel):
    """Define a description of samples and profile locations."""

    sample: Series[str] = pa.Field()
    profile: Series[str] = pa.Field()  # type: ignore

    @pa.dataframe_check
    @classmethod
    def check_number_samples(cls, table: DataFrame) -> bool:
        """Check that there are at least two samples."""
        return (table[cls.sample].notnull() & table[cls.profile].notnull()).sum() > 1

    @pa.check("profile", name="profile_presence")
    @classmethod
    def check_profile_presence(
        cls, profile: Series[str]  # type: ignore
    ) -> Series[bool]:
        """Check that every profile is present at the specified location."""
        return cast(Series[bool], profile.map(lambda path: Path(path).is_file()))

    class Config:
        """Configure the schema model."""

        coerce = True
        ordered = True
        strict = True
Attributes
profile: Series[str] = pa.Field() class-attribute instance-attribute
sample: Series[str] = pa.Field() class-attribute instance-attribute
Classes
Config

Configure the schema model.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
class Config:
    """Configure the schema model."""

    coerce = True
    ordered = True
    strict = True
Attributes
coerce = True class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute
Functions
check_number_samples(table: DataFrame) -> bool classmethod

Check that there are at least two samples.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
@pa.dataframe_check
@classmethod
def check_number_samples(cls, table: DataFrame) -> bool:
    """Check that there are at least two samples."""
    return (table[cls.sample].notnull() & table[cls.profile].notnull()).sum() > 1
check_profile_presence(profile: Series[str]) -> Series[bool] classmethod

Check that every profile is present at the specified location.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
@pa.check("profile", name="profile_presence")
@classmethod
def check_profile_presence(
    cls, profile: Series[str]  # type: ignore
) -> Series[bool]:
    """Check that every profile is present at the specified location."""
    return cast(Series[bool], profile.map(lambda path: Path(path).is_file()))
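A minimal validation sketch; it assumes SampleSheet is re-exported from taxpasta.infrastructure.application, and the two profile paths are hypothetical, so they must point to existing files for the profile_presence check to pass.

import pandas as pd

from taxpasta.infrastructure.application import SampleSheet

sheet = pd.DataFrame(
    {
        "sample": ["sample_1", "sample_2"],
        "profile": ["profiles/sample_1.tsv", "profiles/sample_2.tsv"],
    }
)
# Raises pandera.errors.SchemaErrors if fewer than two complete rows exist
# or if any profile path does not point to an existing file.
SampleSheet.validate(sheet, lazy=True)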
standard_profile_file_format

Provide a service for supported tabular file formats.

Classes
StandardProfileFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported standardized profile file formats.

Source code in src/taxpasta/infrastructure/application/standard_profile_file_format.py
@unique
class StandardProfileFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported standardized profile file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
Attributes
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
standard_profile_writer
Modules
arrow_standard_profile_writer

Provide an arrow writer.

Classes
ArrowStandardProfileWriter

Bases: StandardProfileWriter

Define the arrow writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/arrow_standard_profile_writer.py
class ArrowStandardProfileWriter(StandardProfileWriter):
    """Define the arrow writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_feather(target, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/arrow_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_feather(target, **kwargs)
csv_standard_profile_writer

Provide a CSV writer.

Classes
CSVStandardProfileWriter

Bases: StandardProfileWriter

Define the CSV writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/csv_standard_profile_writer.py
class CSVStandardProfileWriter(StandardProfileWriter):
    """Define the CSV writer."""

    @classmethod
    def write(
        cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_csv(target, index=False, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/csv_standard_profile_writer.py
@classmethod
def write(
    cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_csv(target, index=False, **kwargs)
ods_standard_profile_writer

Provide an ODS writer.

Classes
ODSStandardProfileWriter

Bases: StandardProfileWriter

Define the ODS writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/ods_standard_profile_writer.py
class ODSStandardProfileWriter(StandardProfileWriter):
    """Define the ODS writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_excel(target, index=False, engine="odf", **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/ods_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_excel(target, index=False, engine="odf", **kwargs)
parquet_standard_profile_writer

Provide a parquet writer.

Classes
ParquetStandardProfileWriter

Bases: StandardProfileWriter

Define the parquet writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/parquet_standard_profile_writer.py
class ParquetStandardProfileWriter(StandardProfileWriter):
    """Define the parquet writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_parquet(target, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/parquet_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_parquet(target, **kwargs)
tsv_standard_profile_writer

Provide a TSV writer.

Classes
TSVStandardProfileWriter

Bases: StandardProfileWriter

Define the TSV writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/tsv_standard_profile_writer.py
class TSVStandardProfileWriter(StandardProfileWriter):
    """Define the TSV writer."""

    @classmethod
    def write(
        cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_csv(target, sep="\t", index=False, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/tsv_standard_profile_writer.py
@classmethod
def write(
    cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_csv(target, sep="\t", index=False, **kwargs)
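All of these writers share the same classmethod interface, so a round trip through the TSV writer and the matching table reader (documented further below) is a one-liner each way. A minimal sketch with toy data and an in-memory buffer; the module paths follow the source locations shown in this document:

import io

import pandas as pd

from taxpasta.infrastructure.application.standard_profile_writer.tsv_standard_profile_writer import (
    TSVStandardProfileWriter,
)
from taxpasta.infrastructure.application.table_reader.tsv_table_reader import (
    TSVTableReader,
)

profile = pd.DataFrame({"taxonomy_id": [562, 0], "count": [15, 3]})
buffer = io.StringIO()
TSVStandardProfileWriter.write(profile, buffer)  # delegates to DataFrame.to_csv
buffer.seek(0)
assert TSVTableReader.read(buffer).equals(profile)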
xlsx_standard_profile_writer

Provide an XLSX writer.

Classes
XLSXStandardProfileWriter

Bases: StandardProfileWriter

Define the XLSX writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/xlsx_standard_profile_writer.py
class XLSXStandardProfileWriter(StandardProfileWriter):
    """Define the XLSX writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_excel(target, index=False, engine="openpyxl", **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/xlsx_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_excel(target, index=False, engine="openpyxl", **kwargs)
supported_profiler

Provide an enumeration of supported taxonomic profilers.

Classes
SupportedProfiler

Bases: str, Enum

Define supported taxonomic profilers.

Source code in src/taxpasta/infrastructure/application/supported_profiler.py
@unique
class SupportedProfiler(str, Enum):
    """Define supported taxonomic profilers."""

    bracken = "bracken"
    centrifuge = "centrifuge"
    diamond = "diamond"
    ganon = "ganon"
    kaiju = "kaiju"
    kmcp = "kmcp"
    kraken2 = "kraken2"
    krakenuniq = "krakenuniq"
    megan6 = "megan6"
    metaphlan = "metaphlan"
    motus = "motus"
Attributes
bracken = 'bracken' class-attribute instance-attribute
centrifuge = 'centrifuge' class-attribute instance-attribute
diamond = 'diamond' class-attribute instance-attribute
ganon = 'ganon' class-attribute instance-attribute
kaiju = 'kaiju' class-attribute instance-attribute
kmcp = 'kmcp' class-attribute instance-attribute
kraken2 = 'kraken2' class-attribute instance-attribute
krakenuniq = 'krakenuniq' class-attribute instance-attribute
megan6 = 'megan6' class-attribute instance-attribute
metaphlan = 'metaphlan' class-attribute instance-attribute
motus = 'motus' class-attribute instance-attribute
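Because the enum also derives from str, its values parse command-line input directly, and the members serve as lookup keys for the application service registry. A minimal sketch, assuming both names are re-exported from taxpasta.infrastructure.application:

from taxpasta.infrastructure.application import (
    ApplicationServiceRegistry,
    SupportedProfiler,
)

profiler = SupportedProfiler("kraken2")  # str values map user input to members
reader_cls = ApplicationServiceRegistry.profile_reader(profiler)
service_cls = ApplicationServiceRegistry.profile_standardisation_service(profiler)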
table_reader
Modules
arrow_table_reader

Provide an arrow reader.

Classes
ArrowTableReader

Bases: TableReader

Define the arrow reader.

Source code in src/taxpasta/infrastructure/application/table_reader/arrow_table_reader.py
class ArrowTableReader(TableReader):
    """Define the arrow reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read arrow from the given source."""
        return pd.read_feather(source, **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read arrow from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/arrow_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read arrow from the given source."""
    return pd.read_feather(source, **kwargs)
csv_table_reader

Provide a CSV reader.

Classes
CSVTableReader

Bases: TableReader

Define the CSV reader.

Source code in src/taxpasta/infrastructure/application/table_reader/csv_table_reader.py
class CSVTableReader(TableReader):
    """Define the CSV reader."""

    @classmethod
    def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read CSV from the given source."""
        return pd.read_csv(source, **kwargs)
Functions
read(source: BufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read CSV from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/csv_table_reader.py
@classmethod
def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read CSV from the given source."""
    return pd.read_csv(source, **kwargs)
ods_table_reader

Provide an ODS reader.

Classes
ODSTableReader

Bases: TableReader

Define the ODS reader.

Source code in src/taxpasta/infrastructure/application/table_reader/ods_table_reader.py
class ODSTableReader(TableReader):
    """Define the ODS reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read ODS from the given source."""
        return pd.read_excel(source, engine="odf", **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read ODS from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/ods_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read ODS from the given source."""
    return pd.read_excel(source, engine="odf", **kwargs)
parquet_table_reader

Provide a parquet reader.

Classes
ParquetTableReader

Bases: TableReader

Define the parquet reader.

Source code in src/taxpasta/infrastructure/application/table_reader/parquet_table_reader.py
class ParquetTableReader(TableReader):
    """Define the parquet reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read parquet from the given source."""
        return pd.read_parquet(source, **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read parquet from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/parquet_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read parquet from the given source."""
    return pd.read_parquet(source, **kwargs)
tsv_table_reader

Provide a TSV reader.

Classes
TSVTableReader

Bases: TableReader

Define the TSV reader.

Source code in src/taxpasta/infrastructure/application/table_reader/tsv_table_reader.py
class TSVTableReader(TableReader):
    """Define the TSV reader."""

    @classmethod
    def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read TSV from the given source."""
        return pd.read_table(source, sep="\t", **kwargs)
Functions
read(source: BufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read TSV from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/tsv_table_reader.py
@classmethod
def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read TSV from the given source."""
    return pd.read_table(source, sep="\t", **kwargs)
xlsx_table_reader

Provide an XLSX reader.

Classes
XLSXTableReader

Bases: TableReader

Define the XLSX reader.

Source code in src/taxpasta/infrastructure/application/table_reader/xlsx_table_reader.py
class XLSXTableReader(TableReader):
    """Define the XLSX reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read XLSX from the given source."""
        return pd.read_excel(source, engine="openpyxl", **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read XLSX from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/xlsx_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read XLSX from the given source."""
    return pd.read_excel(source, engine="openpyxl", **kwargs)
table_reader_file_format

Provide a service for supported tabular file formats.

Classes
TableReaderFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported tabular file formats.

Source code in src/taxpasta/infrastructure/application/table_reader_file_format.py
@unique
class TableReaderFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported tabular file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
Attributes
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
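The DependencyCheckMixin contributes the guess_format and check_dependencies helpers that the merge command below relies on. A minimal sketch; that the .xlsx suffix maps to the XLSX member is an assumption here:

from pathlib import Path

from taxpasta.infrastructure.application import TableReaderFileFormat

fmt = TableReaderFileFormat.guess_format(Path("samples.xlsx"))
assert fmt is TableReaderFileFormat.XLSX
# Raises RuntimeError if an optional dependency, for example openpyxl, is missing.
TableReaderFileFormat.check_dependencies(fmt)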
tidy_observation_table_file_format

Provide a service for supported tabular file formats.

Classes
TidyObservationTableFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported tabular file formats.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_file_format.py
@unique
class TidyObservationTableFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported tabular file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
Attributes
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
tidy_observation_table_writer
Modules
arrow_table_writer

Provide an arrow writer.

Classes
ArrowTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the arrow writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/arrow_table_writer.py
class ArrowTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the arrow writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_feather(target, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/arrow_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_feather(target, **kwargs)
csv_table_writer

Provide a CSV writer.

Classes
CSVTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the CSV writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/csv_table_writer.py
class CSVTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the CSV writer."""

    @classmethod
    def write(
        cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_csv(target, index=False, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/csv_table_writer.py
@classmethod
def write(
    cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_csv(target, index=False, **kwargs)
ods_table_writer

Provide an ODS writer.

Classes
ODSTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the ODS writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/ods_table_writer.py
class ODSTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the ODS writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_excel(target, index=False, engine="odf", **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/ods_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_excel(target, index=False, engine="odf", **kwargs)
parquet_table_writer

Provide a parquet writer.

Classes
ParquetTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the parquet writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/parquet_table_writer.py
class ParquetTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the parquet writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_parquet(target, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/parquet_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_parquet(target, **kwargs)
tsv_table_writer

Provide a TSV writer.

Classes
TSVTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the TSV writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/tsv_table_writer.py
class TSVTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the TSV writer."""

    @classmethod
    def write(
        cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_csv(target, sep="\t", index=False, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/tsv_table_writer.py
@classmethod
def write(
    cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_csv(target, sep="\t", index=False, **kwargs)
xlsx_table_writer

Provide an XLSX writer.

Classes
XLSXTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the XLSX writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/xlsx_table_writer.py
class XLSXTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the XLSX writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_excel(target, index=False, engine="openpyxl", **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/xlsx_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_excel(target, index=False, engine="openpyxl", **kwargs)
wide_observation_table_file_format

Provide a service for supported container file formats.

Classes
WideObservationTableFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported container file formats.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_file_format.py
@unique
class WideObservationTableFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported container file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
    BIOM = "BIOM"
Attributes
BIOM = 'BIOM' class-attribute instance-attribute
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
wide_observation_table_writer
Modules
arrow_wide_observation_table_writer

Provide an arrow writer.

Classes
ArrowWideObservationTableWriter

Bases: WideObservationTableWriter

Define the arrow writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/arrow_wide_observation_table_writer.py
class ArrowWideObservationTableWriter(WideObservationTableWriter):
    """Define the arrow writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_feather(target, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/arrow_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_feather(target, **kwargs)
biom_wide_observation_table_writer

Provide a Biological Observation Matrix (BIOM) writer.

Classes
BIOMWideObservationTableWriter

Bases: WideObservationTableWriter

Define the Biological Observation Matrix (BIOM) writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/biom_wide_observation_table_writer.py
class BIOMWideObservationTableWriter(WideObservationTableWriter):
    """Define the Biological Observation Matrix (BIOM) writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: Filepath,
        taxonomy: Optional[TaxonomyService] = None,
        generated_by: str = "taxpasta",
        **kwargs,
    ) -> None:
        """Write the given data to the given buffer or file."""
        # Drop unclassified reads.
        matrix = matrix.loc[matrix.iloc[:, 0] != 0].copy()
        if taxonomy is not None:
            observation_meta = taxonomy.format_biom_taxonomy(matrix)
            tmp = taxonomy.add_rank_lineage(matrix)
            for ranks, meta in zip(tmp.rank_lineage, observation_meta):
                meta["rank_lineage"] = ranks
        else:
            observation_meta = None
        result = Table(
            data=matrix.iloc[:, 1:].values,
            observation_ids=matrix.iloc[:, 0].astype(str),
            sample_ids=matrix.columns[1:].astype(str),
            observation_metadata=observation_meta,
            create_date=datetime.utcnow().isoformat(timespec="microseconds"),
        )
        with biom_open(str(target), permission="w") as handle:
            result.to_hdf5(handle, generated_by=generated_by)
Functions
write(matrix: DataFrame[WideObservationTable], target: Filepath, taxonomy: Optional[TaxonomyService] = None, generated_by: str = 'taxpasta', **kwargs) -> None classmethod

Write the given data to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/biom_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: Filepath,
    taxonomy: Optional[TaxonomyService] = None,
    generated_by: str = "taxpasta",
    **kwargs,
) -> None:
    """Write the given data to the given buffer or file."""
    # Drop unclassified reads.
    matrix = matrix.loc[matrix.iloc[:, 0] != 0].copy()
    if taxonomy is not None:
        observation_meta = taxonomy.format_biom_taxonomy(matrix)
        tmp = taxonomy.add_rank_lineage(matrix)
        for ranks, meta in zip(tmp.rank_lineage, observation_meta):
            meta["rank_lineage"] = ranks
    else:
        observation_meta = None
    result = Table(
        data=matrix.iloc[:, 1:].values,
        observation_ids=matrix.iloc[:, 0].astype(str),
        sample_ids=matrix.columns[1:].astype(str),
        observation_metadata=observation_meta,
        create_date=datetime.utcnow().isoformat(timespec="microseconds"),
    )
    with biom_open(str(target), permission="w") as handle:
        result.to_hdf5(handle, generated_by=generated_by)
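Unlike the text-based writers, this one needs an actual file path because BIOM tables are stored as HDF5, and it can optionally enrich observations with taxonomy metadata. A minimal sketch with a toy wide table, no taxonomy service, and a hypothetical output name; it requires the optional biom-format dependency:

import pandas as pd

from taxpasta.infrastructure.application.wide_observation_table_writer.biom_wide_observation_table_writer import (
    BIOMWideObservationTableWriter,
)

wide = pd.DataFrame(
    {"taxonomy_id": [0, 562, 1280], "sample_1": [3, 15, 7], "sample_2": [1, 9, 11]}
)
# The unclassified row (taxonomy ID 0) is dropped before the table is written.
BIOMWideObservationTableWriter.write(wide, "merged.biom")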
csv_wide_observation_table_writer

Provide a CSV writer.

Classes
CSVWideObservationTableWriter

Bases: WideObservationTableWriter

Define the CSV writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/csv_wide_observation_table_writer.py
class CSVWideObservationTableWriter(WideObservationTableWriter):
    """Define the CSV writer."""

    @classmethod
    def write(
        cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_csv(target, index=False, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/csv_wide_observation_table_writer.py
@classmethod
def write(
    cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_csv(target, index=False, **kwargs)
ods_wide_observation_table_writer

Provide an ODS writer.

Classes
ODSWideObservationTableWriter

Bases: WideObservationTableWriter

Define the ODS writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/ods_wide_observation_table_writer.py
class ODSWideObservationTableWriter(WideObservationTableWriter):
    """Define the ODS writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_excel(target, index=False, engine="odf", **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/ods_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_excel(target, index=False, engine="odf", **kwargs)
parquet_wide_observation_table_writer

Provide a parquet writer.

Classes
ParquetWideObservationTableWriter

Bases: WideObservationTableWriter

Define the parquet writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/parquet_wide_observation_table_writer.py
class ParquetWideObservationTableWriter(WideObservationTableWriter):
    """Define the parquet writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_parquet(target, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/parquet_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_parquet(target, **kwargs)
tsv_wide_observation_table_writer

Provide a TSV writer.

Classes
TSVWideObservationTableWriter

Bases: WideObservationTableWriter

Define the TSV writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/tsv_wide_observation_table_writer.py
class TSVWideObservationTableWriter(WideObservationTableWriter):
    """Define the TSV writer."""

    @classmethod
    def write(
        cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_csv(target, sep="\t", index=False, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/tsv_wide_observation_table_writer.py
@classmethod
def write(
    cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_csv(target, sep="\t", index=False, **kwargs)
xlsx_wide_observation_table_writer

Provide an XLSX writer.

Classes
XLSXWideObservationTableWriter

Bases: WideObservationTableWriter

Define the XLSX writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/xlsx_wide_observation_table_writer.py
class XLSXWideObservationTableWriter(WideObservationTableWriter):
    """Define the XLSX writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_excel(target, index=False, engine="openpyxl", **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/xlsx_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_excel(target, index=False, engine="openpyxl", **kwargs)
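Putting format detection and writer lookup together, the wide-format output path of the merge command below boils down to a sketch like this (the output name and toy table are invented, and the re-exports from taxpasta.infrastructure.application are assumed):

from pathlib import Path

import pandas as pd

from taxpasta.infrastructure.application import (
    ApplicationServiceRegistry,
    WideObservationTableFileFormat,
)

wide_table = pd.DataFrame({"taxonomy_id": [562, 1280], "sample_1": [15, 7]})
output = Path("merged.parquet")
fmt = WideObservationTableFileFormat.guess_format(output)
WideObservationTableFileFormat.check_dependencies(fmt)  # e.g. pyarrow for parquet
writer = ApplicationServiceRegistry.wide_observation_table_writer(fmt)
writer.write(wide_table, output)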

cli

Modules

merge

Add the merge command to the taxpasta CLI.

Attributes
logger = logging.getLogger(__name__) module-attribute
Functions
merge(profiles: Optional[List[Path]] = typer.Argument(None, metavar='[PROFILE1 PROFILE2 [...]]', help='Two or more files containing taxonomic profiles. Required unless there is a sample sheet. Filenames will be parsed as sample names.', show_default=False), profiler: SupportedProfiler = typer.Option(..., '--profiler', '-p', case_sensitive=False, help='The taxonomic profiler used. All provided profiles must come from the same tool!', show_default=False), sample_sheet: Optional[Path] = typer.Option(None, '--samplesheet', '-s', help="A table with a header and two columns: the first column named 'sample' which can be any string and the second column named 'profile' which must be a file path to an actual taxonomic abundance profile. If this option is provided, any arguments are ignored.", exists=True, file_okay=True, dir_okay=False, readable=True), samplesheet_format: Optional[TableReaderFileFormat] = typer.Option(None, case_sensitive=False, help='The file format of the sample sheet. Depending on the choice, additional package dependencies may apply. Will be parsed from the sample sheet file name but can be set explicitly.'), output: Path = typer.Option(..., '--output', '-o', help='The desired output file. By default, the file extension will be used to determine the output format, but when setting the format explicitly using the --output-format option, automatic detection is disabled.', show_default=False), output_format: Optional[WideObservationTableFileFormat] = typer.Option(None, case_sensitive=False, help='The desired output format. Depending on the choice, additional package dependencies may apply. By default it will be parsed from the output file name but it can be set explicitly and will then disable the automatic detection.'), wide_format: bool = typer.Option(True, '--wide/--long', help='Output merged abundance data in either wide or (tidy) long format. Ignored when the desired output format is BIOM.'), summarise_at: Optional[str] = typer.Option(None, '--summarise-at', '--summarize-at', help="Summarise abundance profiles at higher taxonomic rank. The provided option must match a rank in the taxonomy exactly. This is akin to the clade assigned reads provided by, for example, kraken2, where the abundances of a whole taxonomic branch are assigned to a taxon at the desired rank. Please note that abundances above the selected rank are simply ignored. No attempt is made to redistribute those down to the desired rank. Some tools, like Bracken, were designed for this purpose but it doesn't seem like a problem we can generally solve here."), taxonomy: Optional[Path] = typer.Option(None, help='The path to a directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional.'), add_name: bool = typer.Option(False, '--add-name', help='Add the taxon name to the output.'), add_rank: bool = typer.Option(False, '--add-rank', help='Add the taxon rank to the output.'), add_lineage: bool = typer.Option(False, '--add-lineage', help="Add the taxon's entire lineage to the output. These are taxon names separated by semi-colons."), add_id_lineage: bool = typer.Option(False, '--add-id-lineage', help="Add the taxon's entire lineage to the output. These are taxon identifiers separated by semi-colons."), add_rank_lineage: bool = typer.Option(False, '--add-rank-lineage', help="Add the taxon's entire rank lineage to the output. These are taxon ranks separated by semi-colons."), ignore_errors: bool = typer.Option(False, '--ignore-errors', help='Ignore any metagenomic profiles with errors. Please note that there must be at least two profiles without errors to merge.')) -> None

Standardise and merge two or more taxonomic profiles.

Source code in src/taxpasta/infrastructure/cli/merge.py
@app.command()
def merge(
    profiles: Optional[List[Path]] = typer.Argument(  # noqa: B008
        None,
        metavar="[PROFILE1 PROFILE2 [...]]",
        help="Two or more files containing taxonomic profiles. Required unless there is"
        " a sample sheet. Filenames will be parsed as sample names.",
        show_default=False,
    ),
    profiler: SupportedProfiler = typer.Option(  # noqa: B008
        ...,
        "--profiler",
        "-p",
        case_sensitive=False,
        help="The taxonomic profiler used. All provided profiles must come from the "
        "same tool!",
        show_default=False,
    ),
    sample_sheet: Optional[Path] = typer.Option(  # noqa: B008
        None,
        "--samplesheet",
        "-s",
        help="A table with a header and two columns: the first "
        "column named 'sample' which can be any string and the second column named "
        "'profile' which "
        "must be a file path to an actual taxonomic abundance profile. If this option "
        "is provided, any arguments are ignored.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    samplesheet_format: Optional[TableReaderFileFormat] = typer.Option(  # noqa: B008
        None,
        case_sensitive=False,
        help="The file format of the sample sheet. Depending on the choice, additional "
        "package dependencies may apply. Will be parsed from the sample sheet "
        "file name but can be set explicitly.",
    ),
    output: Path = typer.Option(  # noqa: B008
        ...,
        "--output",
        "-o",
        help="The desired output file. By default, the file extension will be used to "
        "determine the output format, but when setting the format explicitly using "
        "the --output-format option, automatic detection is disabled.",
        show_default=False,
    ),
    output_format: Optional[
        WideObservationTableFileFormat
    ] = typer.Option(  # noqa: B008
        None,
        case_sensitive=False,
        help="The desired output format. Depending on the choice, additional package "
        "dependencies may apply. By default it will be parsed from the output file "
        "name but it can be set explicitly and will then disable the automatic "
        "detection.",
    ),
    wide_format: bool = typer.Option(  # noqa: B008
        True,
        "--wide/--long",
        help="Output merged abundance data in either wide or (tidy) long format. "
        "Ignored when the desired output format is BIOM.",
    ),
    summarise_at: Optional[str] = typer.Option(  # noqa: B008
        None,
        "--summarise-at",
        "--summarize-at",
        help="Summarise abundance profiles at higher taxonomic rank. The provided "
        "option must match a rank in the taxonomy exactly. This is akin to the clade "
        "assigned reads provided by, for example, kraken2, where the abundances of a "
        "whole taxonomic branch are assigned to a taxon at the desired rank. Please "
        "note that abundances above the selected rank are simply ignored. No attempt "
        "is made to redistribute those down to the desired rank. Some tools, like "
        "Bracken, were designed for this purpose but it doesn't seem like a problem we "
        "can generally solve here.",
    ),
    taxonomy: Optional[Path] = typer.Option(  # noqa: B008
        None,
        help="The path to a directory containing taxdump files. At least nodes.dmp and "
        "names.dmp are required. A merged.dmp file is optional.",
    ),
    add_name: bool = typer.Option(  # noqa: B008
        False,
        "--add-name",
        help="Add the taxon name to the output.",
    ),
    add_rank: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank",
        help="Add the taxon rank to the output.",
    ),
    add_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon names "
        "separated by semi-colons.",
    ),
    add_id_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-id-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon "
        "identifiers separated by semi-colons.",
    ),
    add_rank_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank-lineage",
        help="Add the taxon's entire rank lineage to the output. These are taxon "
        "ranks separated by semi-colons.",
    ),
    ignore_errors: bool = typer.Option(  # noqa: B008
        False,
        "--ignore-errors",
        help="Ignore any metagenomic profiles with errors. Please note that there "
        "must be at least two profiles without errors to merge.",
    ),
) -> None:
    """Standardise and merge two or more taxonomic profiles."""
    # Perform input validation.
    valid_output_format: Union[
        TidyObservationTableFileFormat, WideObservationTableFileFormat
    ]
    # When a BIOM output format is chosen, the result can only be a wide format BIOM.
    if output.suffix.lower() == ".biom" or (
        output_format is not None
        and output_format is WideObservationTableFileFormat.BIOM
    ):
        try:
            WideObservationTableFileFormat.check_dependencies(
                WideObservationTableFileFormat.BIOM
            )
        except RuntimeError as error:
            logger.debug("", exc_info=error)
            logger.critical(str(error))
            raise typer.Exit(code=1)
        valid_output_format = WideObservationTableFileFormat.BIOM
        wide_format = True
    else:
        if wide_format:
            valid_output_format = validate_observation_matrix_format(
                output, None if output_format is None else output_format.value
            )
        else:
            valid_output_format = validate_tidy_observation_table_format(
                output, None if output_format is None else output_format.value
            )

    taxonomy_service: Optional[TaxonomyService] = None
    if taxonomy is not None:
        from taxpasta.infrastructure.domain.service.taxopy_taxonomy_service import (
            TaxopyTaxonomyService,
        )

        taxonomy_service = TaxopyTaxonomyService.from_taxdump(taxonomy)

    try:
        command = AddTaxInfoCommand(
            taxonomy_service=taxonomy_service,
            summarise_at=summarise_at,
            add_name=add_name,
            add_rank=add_rank,
            add_lineage=add_lineage,
            add_id_lineage=add_id_lineage,
            add_rank_lineage=add_rank_lineage,
        )
    except ValueError as exc:
        logger.critical(str(exc))
        raise typer.Exit(code=2)
    # Ensure that we can write to the output directory.
    try:
        output.parent.mkdir(parents=True, exist_ok=True)
    except OSError as error:
        logger.critical("Failed to create the parent directory for the output.")
        logger.critical(str(error))
        raise typer.Exit(1)
    # Extract and transform sample data.
    if sample_sheet is not None:
        valid_sample_format = validate_sample_format(sample_sheet, samplesheet_format)
        logger.info("Read sample sheet from '%s'.", str(sample_sheet))
        sheet = read_sample_sheet(sample_sheet, valid_sample_format)
        data = [(row.sample, row.profile) for row in sheet.itertuples(index=False)]
    else:
        if not profiles:
            logger.critical(
                "Neither a sample sheet nor any profiles were provided. Please adjust "
                "the command."
            )
            raise typer.Exit(code=2)
        elif len(profiles) == 1:
            logger.critical(
                "Only a single profile was provided. Please provide at least two."
            )
            raise typer.Exit(code=2)
        # Parse sample names from file names.
        data = [(prof.stem, prof) for prof in profiles]

    handling_app = SampleHandlingApplication(
        profile_reader=ApplicationServiceRegistry.profile_reader(profiler),
        profile_standardiser=ApplicationServiceRegistry.profile_standardisation_service(
            profiler
        ),
        taxonomy_service=taxonomy_service,
    )
    samples = []
    for name, profile in data:
        try:
            samples.append(handling_app.etl_sample(name, profile))
        except StandardisationError as error:
            logger.debug("", exc_info=error)
            if ignore_errors:
                logger.error(
                    "Error in sample '%s' with profile '%s'.",
                    error.sample,
                    error.profile,
                )
                logger.error(error.message)
                continue
            else:
                logger.critical(
                    "Error in sample '%s' with profile '%s'.",
                    error.sample,
                    error.profile,
                )
                logger.critical(error.message)
                raise typer.Exit(code=1)

    if summarise_at:
        summarised = []
        for sample in samples:
            try:
                summarised.append(handling_app.summarise_sample(sample, summarise_at))
            except ValueError as error:
                logger.debug("", exc_info=error)
                if ignore_errors:
                    logger.error("Error in sample '%s'.", sample.name)
                    logger.error(str(error))
                    continue
                else:
                    logger.critical("Error in sample '%s'.", sample.name)
                    logger.critical(str(error))
                    raise typer.Exit(code=1)
        samples = summarised

    if len(samples) < 2:
        logger.critical("Less than two profiles are without errors. Nothing to merge.")
        raise typer.Exit(code=1)

    result = handling_app.merge_samples(samples, wide_format)

    if valid_output_format is not WideObservationTableFileFormat.BIOM:
        result = command.execute(result)

    logger.info("Write result to '%s'.", str(output))
    if wide_format:
        assert isinstance(  # nosec assert_used
            valid_output_format, WideObservationTableFileFormat
        )
        writer = ApplicationServiceRegistry.wide_observation_table_writer(
            valid_output_format
        )
    else:
        assert isinstance(  # nosec assert_used
            valid_output_format, TidyObservationTableFileFormat
        )
        writer = ApplicationServiceRegistry.tidy_observation_table_writer(
            valid_output_format  # type: ignore
        )
    try:
        if valid_output_format is WideObservationTableFileFormat.BIOM:
            writer.write(result, output, taxonomy=taxonomy_service)
        else:
            writer.write(result, output)
    except OSError as error:
        logger.debug("", exc_info=error)
        logger.critical("Failed to write the output result.")
        logger.critical(str(error))
        raise typer.Exit(1)
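A typical invocation, with hypothetical file names, might look like:

taxpasta merge --profiler kraken2 --output merged.tsv sample_1.kraken2 sample_2.kraken2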
read_sample_sheet(sample_sheet: Path, sample_format: TableReaderFileFormat) -> DataFrame[SampleSheet]

Extract and validate the sample sheet.

Parameters:

    sample_sheet (Path): Path to the sample sheet. [required]
    sample_format (TableReaderFileFormat): The determined file format. [required]

Returns:

    DataFrame[SampleSheet]: A pandas data frame in the form of a sample sheet.

Raises:

    Exit: Early abortion of program when there is a schema error.

Source code in src/taxpasta/infrastructure/cli/merge.py
def read_sample_sheet(
    sample_sheet: Path, sample_format: TableReaderFileFormat
) -> DataFrame[SampleSheet]:
    """
    Extract and validate the sample sheet.

    Args:
        sample_sheet: Path to the sample sheet.
        sample_format: The determined file format.

    Returns:
        A pandas data frame in the form of a sample sheet.

    Raises:
        Exit: Early abortion of program when there is a schema error.

    """
    reader = ApplicationServiceRegistry.table_reader(sample_format)
    result = reader.read(sample_sheet)
    try:
        SampleSheet.validate(result, lazy=True)
    except pandera.errors.SchemaErrors as errors:
        logger.debug("", exc_info=errors)
        logger.critical("Parsing the sample sheet '%s' failed.", str(sample_sheet))
        logger.critical(errors.failure_cases)
        raise typer.Exit(code=1)
    return result
validate_observation_matrix_format(output: Path, output_format: Optional[str]) -> WideObservationTableFileFormat

Detect the output format if it isn't given.

Parameters:

    output (Path): Path for the output. [required]
    output_format (Optional[str]): The selected file format if any. [required]

Returns:

    WideObservationTableFileFormat: The validated output file format.

Raises:

    Exit: Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/merge.py
def validate_observation_matrix_format(
    output: Path, output_format: Optional[str]
) -> WideObservationTableFileFormat:
    """
    Detect the output format if it isn't given.

    Args:
        output: Path for the output.
        output_format: The selected file format if any.

    Returns:
        The validated output file format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if output_format is None:
        try:
            result = cast(
                WideObservationTableFileFormat,
                WideObservationTableFileFormat.guess_format(output),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the output or set the '--output-format' explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = WideObservationTableFileFormat(output_format)
    try:
        WideObservationTableFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
validate_sample_format(sample_sheet: Path, sample_format: Optional[TableReaderFileFormat]) -> TableReaderFileFormat

Detect the sample sheet format if it isn't given.

Parameters:

    sample_sheet (Path): Path to the sample sheet. [required]
    sample_format (Optional[TableReaderFileFormat]): The selected file format if any. [required]

Returns:

    TableReaderFileFormat: The validated sample sheet format.

Raises:

    Exit: Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/merge.py
def validate_sample_format(
    sample_sheet: Path, sample_format: Optional[TableReaderFileFormat]
) -> TableReaderFileFormat:
    """
    Detect the sample sheet format if it isn't given.

    Args:
        sample_sheet: Path to the sample sheet.
        sample_format: The selected file format if any.

    Returns:
        The validated sample sheet format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if sample_format is None:
        try:
            result = cast(
                TableReaderFileFormat,
                TableReaderFileFormat.guess_format(sample_sheet),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the sample sheet or set the '--samplesheet-format' "
                "explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = sample_format
    try:
        TableReaderFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
validate_tidy_observation_table_format(output: Path, output_format: Optional[str]) -> TidyObservationTableFileFormat

Detect the output format if it isn't given.

Parameters:

Name           Type           Description                       Default
output         Path           Path for the output.              required
output_format  Optional[str]  The selected file format if any.  required

Returns:

Type                            Description
TidyObservationTableFileFormat  The validated output file format.

Raises:

Type  Description
Exit  Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/merge.py
def validate_tidy_observation_table_format(
    output: Path, output_format: Optional[str]
) -> TidyObservationTableFileFormat:
    """
    Detect the output format if it isn't given.

    Args:
        output: Path for the output.
        output_format: The selected file format if any.

    Returns:
        The validated output file format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if output_format is None:
        try:
            result = cast(
                TidyObservationTableFileFormat,
                TidyObservationTableFileFormat.guess_format(output),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the output or set the '--output-format' explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = TidyObservationTableFileFormat(output_format)
    try:
        TidyObservationTableFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
standardise

Add the standardize command to the taxpasta CLI.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
Functions
standardise(profile: Path = typer.Argument(..., metavar='PROFILE', help='A file containing a taxonomic profile.', show_default=False), profiler: SupportedProfiler = typer.Option(..., '--profiler', '-p', case_sensitive=False, help='The taxonomic profiler used.', show_default=False), output: Path = typer.Option(..., '--output', '-o', help='The desired output file. By default, the file extension will be used to determine the output format, but when setting the format explicitly using the --output-format option, automatic detection is disabled.', show_default=False), output_format: Optional[StandardProfileFileFormat] = typer.Option(None, case_sensitive=False, help='The desired output format. Depending on the choice, additional package dependencies may apply. By default it will be parsed from the output file name but it can be set explicitly and will then disable the automatic detection.'), summarise_at: Optional[str] = typer.Option(None, '--summarise-at', '--summarize-at', help="Summarise abundance profiles at higher taxonomic rank. The provided option must match a rank in the taxonomy exactly. This is akin to the clade assigned reads provided by, for example, kraken2, where the abundances of a whole taxonomic branch are assigned to a taxon at the desired rank. Please note that abundances above the selected rank are simply ignored. No attempt is made to redistribute those down to the desired rank. Some tools, like Bracken, were designed for this purpose but it doesn't seem like a problem we can generally solve here."), taxonomy: Optional[Path] = typer.Option(None, help='The path to a directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional.'), add_name: bool = typer.Option(False, '--add-name', help='Add the taxon name to the output.'), add_rank: bool = typer.Option(False, '--add-rank', help='Add the taxon rank to the output.'), add_lineage: bool = typer.Option(False, '--add-lineage', help="Add the taxon's entire lineage to the output. These are taxon names separated by semi-colons."), add_id_lineage: bool = typer.Option(False, '--add-id-lineage', help="Add the taxon's entire lineage to the output. These are taxon identifiers separated by semi-colons."), add_rank_lineage: bool = typer.Option(False, '--add-rank-lineage', help="Add the taxon's entire rank lineage to the output. These are taxon ranks separated by semi-colons.")) -> None

Standardise a taxonomic profile.

Source code in src/taxpasta/infrastructure/cli/standardise.py
@app.command(
    no_args_is_help=True, help="Standardise a taxonomic profile (alias: 'standardize')."
)
@app.command("standardize", hidden=True)
def standardise(
    profile: Path = typer.Argument(  # noqa: B008
        ...,
        metavar="PROFILE",
        help="A file containing a taxonomic profile.",
        show_default=False,
    ),
    profiler: SupportedProfiler = typer.Option(  # noqa: B008
        ...,
        "--profiler",
        "-p",
        case_sensitive=False,
        help="The taxonomic profiler used.",
        show_default=False,
    ),
    output: Path = typer.Option(  # noqa: B008
        ...,
        "--output",
        "-o",
        help="The desired output file. By default, the file extension will be used to "
        "determine the output format, but when setting the format explicitly using "
        "the --output-format option, automatic detection is disabled.",
        show_default=False,
    ),
    output_format: Optional[StandardProfileFileFormat] = typer.Option(  # noqa: B008
        None,
        case_sensitive=False,
        help="The desired output format. Depending on the choice, additional package "
        "dependencies may apply. By default it will be parsed from the output file "
        "name but it can be set explicitly and will then disable the automatic "
        "detection.",
    ),
    summarise_at: Optional[str] = typer.Option(  # noqa: B008
        None,
        "--summarise-at",
        "--summarize-at",
        help="Summarise abundance profiles at higher taxonomic rank. The provided "
        "option must match a rank in the taxonomy exactly. This is akin to the clade "
        "assigned reads provided by, for example, kraken2, where the abundances of a "
        "whole taxonomic branch are assigned to a taxon at the desired rank. Please "
        "note that abundances above the selected rank are simply ignored. No attempt "
        "is made to redistribute those down to the desired rank. Some tools, like "
        "Bracken, were designed for this purpose but it doesn't seem like a problem we "
        "can generally solve here.",
    ),
    taxonomy: Optional[Path] = typer.Option(  # noqa: B008
        None,
        help="The path to a directory containing taxdump files. At least nodes.dmp and "
        "names.dmp are required. A merged.dmp file is optional.",
    ),
    add_name: bool = typer.Option(  # noqa: B008
        False,
        "--add-name",
        help="Add the taxon name to the output.",
    ),
    add_rank: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank",
        help="Add the taxon rank to the output.",
    ),
    add_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon names "
        "separated by semi-colons.",
    ),
    add_id_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-id-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon "
        "identifiers separated by semi-colons.",
    ),
    add_rank_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank-lineage",
        help="Add the taxon's entire rank lineage to the output. These are taxon "
        "ranks separated by semi-colons.",
    ),
) -> None:
    """Standardise a taxonomic profile."""
    # Perform input validation.
    valid_output_format = validate_output_format(
        output, None if output_format is None else output_format.value
    )

    taxonomy_service: Optional[TaxonomyService] = None
    if taxonomy is not None:
        from taxpasta.infrastructure.domain.service.taxopy_taxonomy_service import (
            TaxopyTaxonomyService,
        )

        taxonomy_service = TaxopyTaxonomyService.from_taxdump(taxonomy)

    try:
        tax_info_command = AddTaxInfoCommand(
            taxonomy_service=taxonomy_service,
            summarise_at=summarise_at,
            add_name=add_name,
            add_rank=add_rank,
            add_lineage=add_lineage,
            add_id_lineage=add_id_lineage,
            add_rank_lineage=add_rank_lineage,
        )
    except ValueError as exc:
        logger.critical(str(exc))
        raise typer.Exit(code=2)

    # Ensure that we can write to the output directory.
    try:
        output.parent.mkdir(parents=True, exist_ok=True)
    except OSError as error:
        logger.critical("Failed to create the parent directory for the output.")
        logger.critical(str(error))
        raise typer.Exit(1)

    handling_app = SampleHandlingApplication(
        profile_reader=ApplicationServiceRegistry.profile_reader(profiler),
        profile_standardiser=ApplicationServiceRegistry.profile_standardisation_service(
            profiler
        ),
        taxonomy_service=taxonomy_service,
    )
    try:
        sample = handling_app.etl_sample(profile.stem, profile)
    except StandardisationError as error:
        logger.debug("", exc_info=error)
        logger.critical(
            "Error in sample '%s' with profile '%s'.", error.sample, error.profile
        )
        logger.critical(error.message)
        raise typer.Exit(code=1)

    if summarise_at:
        try:
            sample = handling_app.summarise_sample(sample, summarise_at)
        except ValueError as error:
            logger.debug("", exc_info=error)
            logger.critical("Error in sample '%s'.", sample.name)
            logger.critical(str(error))
            raise typer.Exit(code=1)

    result = tax_info_command.execute(sample.profile)

    logger.info("Write result to '%s'.", str(output))
    writer = ApplicationServiceRegistry.standard_profile_writer(valid_output_format)
    try:
        writer.write(result, output)
    except OSError as error:
        logger.critical("Failed to write the output result.")
        logger.critical(str(error))
        raise typer.Exit(1)
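
A command-line sketch of the above (the profile file name is hypothetical; kraken2 is one of the supported profilers):

taxpasta standardise -p kraken2 -o standardised.tsv sample.kraken2.report.txt

# Enrich the output with taxonomy information from a local taxdump directory.
taxpasta standardise -p kraken2 -o standardised.tsv \
    --taxonomy taxdump/ --add-name --add-rank sample.kraken2.report.txt
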
validate_output_format(output: Path, output_format: Optional[str]) -> StandardProfileFileFormat

Detect the output format if it isn't given.

Parameters:

Name           Type           Description                       Default
output         Path           Path for the output.              required
output_format  Optional[str]  The selected file format if any.  required

Returns:

Type                       Description
StandardProfileFileFormat  The validated output file format.

Raises:

Type  Description
Exit  Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/standardise.py
def validate_output_format(
    output: Path, output_format: Optional[str]
) -> StandardProfileFileFormat:
    """
    Detect the output format if it isn't given.

    Args:
        output: Path for the output.
        output_format: The selected file format if any.

    Returns:
        The validated output file format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if output_format is None:
        try:
            result = cast(
                StandardProfileFileFormat,
                StandardProfileFileFormat.guess_format(output),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the output or set the '--output-format' explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = StandardProfileFileFormat(output_format)
    try:
        StandardProfileFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
taxpasta

Provide a command-line interface (CLI) for taxpasta functionality.

Attributes
app = typer.Typer(help='TAXonomic Profile Aggregation and STAndardisation', context_settings={'help_option_names': ['-h', '--help']}) module-attribute
logger = logging.getLogger('taxpasta') module-attribute
Classes
LogLevel

Bases: str, Enum

Define the choices for the log level option.

Source code in src/taxpasta/infrastructure/cli/taxpasta.py
@unique
class LogLevel(str, Enum):
    """Define the choices for the log level option."""

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
Attributes
CRITICAL = 'CRITICAL' class-attribute instance-attribute
DEBUG = 'DEBUG' class-attribute instance-attribute
ERROR = 'ERROR' class-attribute instance-attribute
INFO = 'INFO' class-attribute instance-attribute
WARNING = 'WARNING' class-attribute instance-attribute
Functions
initialize(context: typer.Context, version: Optional[bool] = typer.Option(None, '--version', callback=version_callback, is_eager=True, help='Print only the current tool version and exit.'), log_level: LogLevel = typer.Option(LogLevel.INFO.name, '--log-level', '-l', case_sensitive=False, help='Set the desired log level.'))

Initialize logging and rich printing if available.

Source code in src/taxpasta/infrastructure/cli/taxpasta.py
@app.callback(invoke_without_command=True)
def initialize(
    context: typer.Context,
    version: Optional[bool] = typer.Option(  # noqa: B008
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Print only the current tool version and exit.",
    ),
    log_level: LogLevel = typer.Option(  # noqa: B008
        LogLevel.INFO.name,
        "--log-level",
        "-l",
        case_sensitive=False,
        help="Set the desired log level.",
    ),
):
    """Initialize logging and rich printing if available."""
    try:
        from rich.logging import RichHandler

        logging.basicConfig(
            level=log_level.name,
            format="%(message)s",
            datefmt="[%X]",
            handlers=[RichHandler(rich_tracebacks=True, tracebacks_suppress=[typer])],
        )
    except ModuleNotFoundError:
        logging.basicConfig(level=log_level.name, format="[%(levelname)s] %(message)s")
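
Since the option is defined on the callback, the log level precedes any subcommand and is matched case-insensitively (a sketch with hypothetical file names):

taxpasta --log-level DEBUG standardise -p kraken2 -o out.tsv profile.txt
taxpasta -l debug standardise -p kraken2 -o out.tsv profile.txt
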
version_callback(is_set: bool) -> None

Print the tool version if desired.

Parameters:

Name    Type  Description                                                  Default
is_set  bool  Whether the version was requested as a command line option.  required

Raises:

Type  Description
Exit  With default code 0 to signal normal program end.

Source code in src/taxpasta/infrastructure/cli/taxpasta.py
def version_callback(is_set: bool) -> None:
    """
    Print the tool version if desired.

    Args:
        is_set: Whether the version was requested as a command line option.

    Raises:
        Exit: With default code 0 to signal normal program end.

    """
    if is_set:
        print(taxpasta.__version__)
        raise typer.Exit()
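
Invoked through the eager --version option defined above, this prints only the version string and exits with code 0:

taxpasta --version
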
Modules

domain

Provide concrete implementations of domain models and services.

Modules

service

Provide concrete implementations of domain services.

Modules
taxopy_taxonomy_service

Provide a taxonomy service based on taxopy.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
TaxopyTaxonomyService

Bases: TaxonomyService

Define the taxonomy service based on taxopy.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
class TaxopyTaxonomyService(TaxonomyService):
    """Define the taxonomy service based on taxopy."""

    def __init__(self, *, tax_db: taxopy.TaxDb, **kwargs) -> None:
        """Initialize a taxonomy service instance with a taxopy database."""
        super().__init__(**kwargs)
        self._tax_db = tax_db

    @classmethod
    def from_taxdump(cls, source: Path) -> TaxopyTaxonomyService:
        """Create a service instance from a directory path containing taxdump info."""
        merged = source / "merged.dmp"
        return cls(
            tax_db=taxopy.TaxDb(
                names_dmp=str(source / "names.dmp"),
                nodes_dmp=str(source / "nodes.dmp"),
                merged_dmp=str(merged) if merged.is_file() else None,
                keep_files=True,
            )
        )

    def get_taxon_name(self, taxonomy_id: int) -> Optional[str]:
        """Return the name of a given taxonomy identifier."""
        return self._tax_db.taxid2name.get(taxonomy_id)

    def get_taxon_rank(self, taxonomy_id: int) -> Optional[str]:
        """Return the rank of a given taxonomy identifier."""
        return self._tax_db.taxid2rank.get(taxonomy_id)

    def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
        """
        Return the lineage of a given taxonomy identifier as names.

        Only names with associated ranks are included.

        """
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return list(reversed(taxon.rank_name_dictionary.values()))

    def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]:
        """
        Return the lineage of a given taxonomy identifier as identifiers.

        Only identifiers with associated ranks are included.

        """
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return list(reversed(taxon.rank_taxid_dictionary.values()))

    def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
        """Return the lineage of a given taxonomy identifier as ranks."""
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return list(reversed(taxon.rank_name_dictionary.keys()))

    def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon name to the given table."""
        result = table.copy()
        result.insert(
            1,
            "name",
            table.taxonomy_id.map(self._tax_db.taxid2name),
        )
        return result

    def add_rank(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon rank to the given table."""
        result = table.copy()
        result.insert(
            1,
            "rank",
            table.taxonomy_id.map(self._tax_db.taxid2rank),
        )
        return result

    def add_name_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage to the given table."""
        result = table.copy()
        result.insert(
            1,
            "lineage",
            table.taxonomy_id.map(self._name_lineage_as_str),
        )
        return result

    def _name_lineage_as_str(self, taxonomy_id: int) -> Optional[str]:
        """Return the lineage of a taxon as concatenated names."""
        if lineage := self.get_taxon_name_lineage(taxonomy_id):
            return ";".join(lineage)
        else:
            return None

    def add_identifier_lineage(
        self, table: DataFrame[ResultTable]
    ) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage as identifiers to the given table."""
        result = table.copy()
        result.insert(
            1,
            "id_lineage",
            table.taxonomy_id.map(self._taxid_lineage_as_str),
        )
        return result

    def _taxid_lineage_as_str(self, taxonomy_id: int) -> Optional[str]:
        """Return the lineage of a taxon as concatenated identifiers."""
        if lineage := self.get_taxon_identifier_lineage(taxonomy_id):
            return ";".join(str(tax_id) for tax_id in lineage)
        else:
            return None

    def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage as ranks to the given table."""
        result = table.copy()
        result.insert(
            1,
            "rank_lineage",
            table.taxonomy_id.map(self._rank_lineage_as_str),
        )
        return result

    def _rank_lineage_as_str(self, taxonomy_id: int) -> Optional[str]:
        """Return the rank lineage of a taxon as concatenated identifiers."""
        if lineage := self.get_taxon_rank_lineage(taxonomy_id):
            return ";".join(lineage)
        else:
            return None

    def format_biom_taxonomy(
        self, table: DataFrame[ResultTable]
    ) -> List[Dict[str, List[str]]]:
        """Format the taxonomy as BIOM observation metadata."""
        lineages = [self._get_rank_name(tax_id) for tax_id in table.taxonomy_id]
        result = [{} if lineage is None else lineage for lineage in lineages]
        longest_lineage = max(result, key=len)
        max_ranks = list(reversed(longest_lineage.keys()))
        return [
            {"taxonomy": self._pad_lineage(lineage, max_ranks)} for lineage in result
        ]

    def _get_rank_name(self, taxonomy_id: int) -> Optional[Dict[str, str]]:
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return taxon.rank_name_dictionary

    def _pad_lineage(self, lineage: Dict[str, str], max_ranks: List[str]) -> List[str]:
        """Pad a lineage to match the length of the longest lineage."""
        return [lineage.get(rank, "") for rank in max_ranks]

    def summarise_at(
        self, profile: DataFrame[StandardProfile], rank: str
    ) -> DataFrame[StandardProfile]:
        """Summarise a standardised abundance profile at a higher taxonomic rank."""
        branching = defaultdict(list)
        for tax_id in profile[StandardProfile.taxonomy_id]:
            # For now, we ignore the identifier zero (unclassified).
            if tax_id == 0:
                continue
            taxon = taxopy.Taxon(taxid=tax_id, taxdb=self._tax_db)
            if taxon.rank == rank:
                branching[taxon.taxid].append(taxon.taxid)
                continue
            for parent_id in taxon.taxid_lineage:
                ancestor_rank = self._tax_db.taxid2rank[parent_id]
                if ancestor_rank == rank:
                    # We do not need to summarize further than to the desired rank.
                    branching[parent_id].append(taxon.taxid)
                    break
            else:
                # We did not encounter the desired rank. Likely, the taxon is situated
                # above the desired rank in the taxonomy.
                logger.debug(
                    "The desired rank '%s' is not in the lineage of the taxon %d - %s.",
                    rank,
                    taxon.taxid,
                    taxon.name,
                )
        finalized = dict(branching)
        root_ids = sorted(finalized)
        counts = []
        for root_id in root_ids:
            leaves = finalized[root_id]
            counts.append(
                profile.loc[
                    profile[StandardProfile.taxonomy_id].isin(leaves),
                    StandardProfile.count,
                ].sum()
            )
        return pd.DataFrame(
            {
                StandardProfile.taxonomy_id: pd.Series(data=root_ids, dtype="category"),
                StandardProfile.count: counts,
            }
        )
Functions
__init__(*, tax_db: taxopy.TaxDb, **kwargs) -> None

Initialize a taxonomy service instance with a taxopy database.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def __init__(self, *, tax_db: taxopy.TaxDb, **kwargs) -> None:
    """Initialize a taxonomy service instance with a taxopy database."""
    super().__init__(**kwargs)
    self._tax_db = tax_db
add_identifier_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon lineage as identifiers to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_identifier_lineage(
    self, table: DataFrame[ResultTable]
) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage as identifiers to the given table."""
    result = table.copy()
    result.insert(
        1,
        "id_lineage",
        table.taxonomy_id.map(self._taxid_lineage_as_str),
    )
    return result
add_name(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon name to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon name to the given table."""
    result = table.copy()
    result.insert(
        1,
        "name",
        table.taxonomy_id.map(self._tax_db.taxid2name),
    )
    return result
add_name_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon lineage to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_name_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage to the given table."""
    result = table.copy()
    result.insert(
        1,
        "lineage",
        table.taxonomy_id.map(self._name_lineage_as_str),
    )
    return result
add_rank(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon rank to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_rank(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon rank to the given table."""
    result = table.copy()
    result.insert(
        1,
        "rank",
        table.taxonomy_id.map(self._tax_db.taxid2rank),
    )
    return result
add_rank_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon lineage as ranks to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage as ranks to the given table."""
    result = table.copy()
    result.insert(
        1,
        "rank_lineage",
        table.taxonomy_id.map(self._rank_lineage_as_str),
    )
    return result
format_biom_taxonomy(table: DataFrame[ResultTable]) -> List[Dict[str, List[str]]]

Format the taxonomy as BIOM observation metadata.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def format_biom_taxonomy(
    self, table: DataFrame[ResultTable]
) -> List[Dict[str, List[str]]]:
    """Format the taxonomy as BIOM observation metadata."""
    lineages = [self._get_rank_name(tax_id) for tax_id in table.taxonomy_id]
    result = [{} if lineage is None else lineage for lineage in lineages]
    longest_lineage = max(result, key=len)
    max_ranks = list(reversed(longest_lineage.keys()))
    return [
        {"taxonomy": self._pad_lineage(lineage, max_ranks)} for lineage in result
    ]
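
A sketch of the resulting shape (the result table and service instance are assumed to exist): every 'taxonomy' list covers the ranks of the longest observed lineage, with empty strings at any rank a given taxon lacks.

metadata = service.format_biom_taxonomy(result_table)
# e.g. [{'taxonomy': ['Bacteria', 'Pseudomonadota', '', '']}, ...]
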
from_taxdump(source: Path) -> TaxopyTaxonomyService classmethod

Create a service instance from a directory path containing taxdump info.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
@classmethod
def from_taxdump(cls, source: Path) -> TaxopyTaxonomyService:
    """Create a service instance from a directory path containing taxdump info."""
    merged = source / "merged.dmp"
    return cls(
        tax_db=taxopy.TaxDb(
            names_dmp=str(source / "names.dmp"),
            nodes_dmp=str(source / "nodes.dmp"),
            merged_dmp=str(merged) if merged.is_file() else None,
            keep_files=True,
        )
    )
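
A construction sketch (the directory path is hypothetical; the identifier 562 denotes Escherichia coli when an NCBI taxdump is used):

from pathlib import Path

service = TaxopyTaxonomyService.from_taxdump(Path("taxdump/"))
# Lookups return None for identifiers absent from the database.
print(service.get_taxon_name(562))  # 'Escherichia coli'
print(service.get_taxon_rank(562))  # 'species'
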
get_taxon_identifier_lineage(taxonomy_id: int) -> Optional[List[int]]

Return the lineage of a given taxonomy identifier as identifiers.

Only identifiers with associated ranks are included.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]:
    """
    Return the lineage of a given taxonomy identifier as identifiers.

    Only identifiers with associated ranks are included.

    """
    try:
        taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
    except TaxidError:
        return None
    return list(reversed(taxon.rank_taxid_dictionary.values()))
get_taxon_name(taxonomy_id: int) -> Optional[str]

Return the name of a given taxonomy identifier.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_name(self, taxonomy_id: int) -> Optional[str]:
    """Return the name of a given taxonomy identifier."""
    return self._tax_db.taxid2name.get(taxonomy_id)
get_taxon_name_lineage(taxonomy_id: int) -> Optional[List[str]]

Return the lineage of a given taxonomy identifier as names.

Only names with associated ranks are included.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
    """
    Return the lineage of a given taxonomy identifier as names.

    Only names with associated ranks are included.

    """
    try:
        taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
    except TaxidError:
        return None
    return list(reversed(taxon.rank_name_dictionary.values()))
get_taxon_rank(taxonomy_id: int) -> Optional[str]

Return the rank of a given taxonomy identifier.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_rank(self, taxonomy_id: int) -> Optional[str]:
    """Return the rank of a given taxonomy identifier."""
    return self._tax_db.taxid2rank.get(taxonomy_id)
get_taxon_rank_lineage(taxonomy_id: int) -> Optional[List[str]]

Return the lineage of a given taxonomy identifier as ranks.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
    """Return the lineage of a given taxonomy identifier as ranks."""
    try:
        taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
    except TaxidError:
        return None
    return list(reversed(taxon.rank_name_dictionary.keys()))
summarise_at(profile: DataFrame[StandardProfile], rank: str) -> DataFrame[StandardProfile]

Summarise a standardised abundance profile at a higher taxonomic rank.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def summarise_at(
    self, profile: DataFrame[StandardProfile], rank: str
) -> DataFrame[StandardProfile]:
    """Summarise a standardised abundance profile at a higher taxonomic rank."""
    branching = defaultdict(list)
    for tax_id in profile[StandardProfile.taxonomy_id]:
        # For now, we ignore the identifier zero (unclassified).
        if tax_id == 0:
            continue
        taxon = taxopy.Taxon(taxid=tax_id, taxdb=self._tax_db)
        if taxon.rank == rank:
            branching[taxon.taxid].append(taxon.taxid)
            continue
        for parent_id in taxon.taxid_lineage:
            ancestor_rank = self._tax_db.taxid2rank[parent_id]
            if ancestor_rank == rank:
                # We do not need to summarize further than to the desired rank.
                branching[parent_id].append(taxon.taxid)
                break
        else:
            # We did not encounter the desired rank. Likely, the taxon is situated
            # above the desired rank in the taxonomy.
            logger.debug(
                "The desired rank '%s' is not in the lineage of the taxon %d - %s.",
                rank,
                taxon.taxid,
                taxon.name,
            )
    finalized = dict(branching)
    root_ids = sorted(finalized)
    counts = []
    for root_id in root_ids:
        leaves = finalized[root_id]
        counts.append(
            profile.loc[
                profile[StandardProfile.taxonomy_id].isin(leaves),
                StandardProfile.count,
            ].sum()
        )
    return pd.DataFrame(
        {
            StandardProfile.taxonomy_id: pd.Series(data=root_ids, dtype="category"),
            StandardProfile.count: counts,
        }
    )
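
A sketch of the roll-up behaviour, reusing the service instance from the construction sketch above and assuming an NCBI taxdump in which 561 is the genus Escherichia and 562 a species within it:

import pandas as pd

profile = pd.DataFrame(
    {
        "taxonomy_id": [561, 562, 0],  # genus, species, unclassified
        "count": [10, 90, 5],
    }
)
summary = service.summarise_at(profile, "genus")
# Identifier 0 is skipped, 562 is assigned to its genus ancestor 561, and
# 561 maps to itself; the result is a single genus row with a count of 100.
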

helpers

Provide general helpers.

Classes

Functions

Modules

base_data_frame_model

Provide a base data frame model for general checks and configuration.

Classes
BaseDataFrameModel

Bases: DataFrameModel

Define the base data frame model for general checks and configuration.

Source code in src/taxpasta/infrastructure/helpers/base_data_frame_model.py
class BaseDataFrameModel(pa.DataFrameModel):
    """Define the base data frame model for general checks and configuration."""

    @pa.dataframe_check
    def check_not_empty(cls, profile: pd.DataFrame) -> bool:
        """Check that the read in profile is *not* empty."""
        return not profile.empty

    class Config:
        """Configure the schema model."""

        coerce = False
        ordered = True
        strict = True
Classes
Config

Configure the schema model.

Source code in src/taxpasta/infrastructure/helpers/base_data_frame_model.py
class Config:
    """Configure the schema model."""

    coerce = False
    ordered = True
    strict = True
Attributes
coerce = False class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute
Functions
check_not_empty(profile: pd.DataFrame) -> bool

Check that the read in profile is not empty.

Source code in src/taxpasta/infrastructure/helpers/base_data_frame_model.py
@pa.dataframe_check
def check_not_empty(cls, profile: pd.DataFrame) -> bool:
    """Check that the read in profile is *not* empty."""
    return not profile.empty
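
A minimal subclass sketch (the schema name and columns are hypothetical) showing how concrete schemas inherit both the non-empty check and the strict, ordered configuration:

import pandera as pa
from pandera.typing import Series

from taxpasta.infrastructure.helpers.base_data_frame_model import BaseDataFrameModel


class ExampleProfile(BaseDataFrameModel):
    """Define a hypothetical two-column profile schema."""

    taxonomy_id: Series[int] = pa.Field(ge=0)
    count: Series[int] = pa.Field(ge=0)

Validating an empty data frame against such a schema raises a pandera SchemaError via the inherited check_not_empty check.
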
decorators

Provide general decorators.

Functions
raise_parser_warnings(func: Callable) -> Callable

Decorate a function in order to raise parser warnings as value errors.

Source code in src/taxpasta/infrastructure/helpers/decorators.py
def raise_parser_warnings(func: Callable) -> Callable:
    """Decorate a function in order to raise parser warnings as value errors."""

    @wraps(func)
    def wrapped(*args, **kwargs) -> Any:
        with warnings.catch_warnings():
            warnings.filterwarnings(action="error", category=ParserWarning)
            try:
                result = func(*args, **kwargs)
            except ParserWarning as exc:
                raise ValueError(
                    "There were unexpected issues with the data. Please double-check "
                    "the specific combination of your chosen metagenomic profiler and "
                    "input profile."
                ) from exc
        return result

    return wrapped
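
A usage sketch (the reader function is hypothetical; the import path follows the source location above):

import pandas as pd

from taxpasta.infrastructure.helpers.decorators import raise_parser_warnings


@raise_parser_warnings
def read_profile(path: str) -> pd.DataFrame:
    # Any pandas ParserWarning emitted here is re-raised as a ValueError
    # with a message pointing at a profiler/profile mismatch.
    return pd.read_csv(path, sep="\t", header=None)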