Skip to content

Modules

application

Classes

Modules

application_service_registry

Provide an application service registry.

Classes
ApplicationServiceRegistry

Define an application service registry.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
class ApplicationServiceRegistry:
    """Define an application service registry.

    Each accessor lazily imports and returns the concrete service class for
    the requested profiler or file format, so optional dependencies are only
    loaded when they are actually used.
    """

    @classmethod
    def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
        """
        Return a profile reader of the correct type.

        Args:
            profiler: A supported taxonomic profiler.

        Returns:
            The profile reader class matching the given profiler.

        Raises:
            ValueError: If the given profiler is not supported.

        """
        if profiler is SupportedProfiler.bracken:
            from .bracken import BrackenProfileReader

            return BrackenProfileReader
        elif profiler is SupportedProfiler.centrifuge:
            from .centrifuge import CentrifugeProfileReader

            return CentrifugeProfileReader
        elif profiler is SupportedProfiler.diamond:
            from .diamond import DiamondProfileReader

            return DiamondProfileReader
        elif profiler is SupportedProfiler.ganon:
            from .ganon import GanonProfileReader

            return GanonProfileReader
        elif profiler is SupportedProfiler.kaiju:
            from .kaiju import KaijuProfileReader

            return KaijuProfileReader
        elif profiler is SupportedProfiler.kmcp:
            from .kmcp import KMCPProfileReader

            return KMCPProfileReader
        elif profiler is SupportedProfiler.kraken2:
            from .kraken2 import Kraken2ProfileReader

            return Kraken2ProfileReader
        elif profiler is SupportedProfiler.krakenuniq:
            from .krakenuniq import KrakenUniqProfileReader

            return KrakenUniqProfileReader
        elif profiler is SupportedProfiler.megan6:
            from .megan6 import Megan6ProfileReader

            return Megan6ProfileReader
        elif profiler is SupportedProfiler.metaphlan:
            from .metaphlan import MetaphlanProfileReader

            return MetaphlanProfileReader
        elif profiler is SupportedProfiler.motus:
            from .motus import MotusProfileReader

            return MotusProfileReader
        else:
            # Guard against enum members added without a matching reader;
            # previously an unknown profiler silently returned None.
            raise ValueError(
                f"The given profiler {profiler.name} is not supported."
            )

    @classmethod
    def profile_standardisation_service(
        cls, profiler: SupportedProfiler
    ) -> Type[ProfileStandardisationService]:
        """
        Return a profile standardisation service of the correct type.

        Args:
            profiler: A supported taxonomic profiler.

        Returns:
            The standardisation service class matching the given profiler.

        Raises:
            ValueError: If the given profiler is not supported.

        """
        if profiler is SupportedProfiler.bracken:
            from .bracken import BrackenProfileStandardisationService

            return BrackenProfileStandardisationService
        elif profiler is SupportedProfiler.centrifuge:
            from .centrifuge import CentrifugeProfileStandardisationService

            return CentrifugeProfileStandardisationService
        elif profiler is SupportedProfiler.diamond:
            from .diamond import DiamondProfileStandardisationService

            return DiamondProfileStandardisationService
        elif profiler is SupportedProfiler.kaiju:
            from .kaiju import KaijuProfileStandardisationService

            return KaijuProfileStandardisationService
        elif profiler is SupportedProfiler.kraken2:
            from .kraken2 import Kraken2ProfileStandardisationService

            return Kraken2ProfileStandardisationService
        elif profiler is SupportedProfiler.krakenuniq:
            from .krakenuniq import KrakenUniqProfileStandardisationService

            return KrakenUniqProfileStandardisationService
        elif profiler is SupportedProfiler.megan6:
            from .megan6 import Megan6ProfileStandardisationService

            return Megan6ProfileStandardisationService
        elif profiler is SupportedProfiler.motus:
            from .motus import MotusProfileStandardisationService

            return MotusProfileStandardisationService
        elif profiler is SupportedProfiler.metaphlan:
            from .metaphlan import MetaphlanProfileStandardisationService

            return MetaphlanProfileStandardisationService
        elif profiler is SupportedProfiler.ganon:
            from .ganon import GanonProfileStandardisationService

            return GanonProfileStandardisationService
        elif profiler is SupportedProfiler.kmcp:
            from .kmcp import KMCPProfileStandardisationService

            return KMCPProfileStandardisationService
        else:
            # Replaced the opaque ValueError("Unexpected") with a message
            # consistent with the other accessors on this registry.
            raise ValueError(
                f"The given profiler {profiler.name} is not supported."
            )

    @classmethod
    def standard_profile_writer(
        cls, file_format: StandardProfileFileFormat
    ) -> Type[StandardProfileWriter]:
        """
        Return a standard profile writer of the correct type.

        Args:
            file_format: A supported standard profile file format.

        Returns:
            The standard profile writer class matching the given format.

        Raises:
            ValueError: If the given file format is not supported.

        """
        if file_format is StandardProfileFileFormat.TSV:
            from .standard_profile_writer.tsv_standard_profile_writer import (
                TSVStandardProfileWriter,
            )

            return TSVStandardProfileWriter
        elif file_format is StandardProfileFileFormat.CSV:
            from .standard_profile_writer.csv_standard_profile_writer import (
                CSVStandardProfileWriter,
            )

            return CSVStandardProfileWriter
        elif file_format is StandardProfileFileFormat.XLSX:
            from .standard_profile_writer.xlsx_standard_profile_writer import (
                XLSXStandardProfileWriter,
            )

            return XLSXStandardProfileWriter
        elif file_format is StandardProfileFileFormat.ODS:
            from .standard_profile_writer.ods_standard_profile_writer import (
                ODSStandardProfileWriter,
            )

            return ODSStandardProfileWriter
        elif file_format is StandardProfileFileFormat.arrow:
            from .standard_profile_writer.arrow_standard_profile_writer import (
                ArrowStandardProfileWriter,
            )

            return ArrowStandardProfileWriter
        elif file_format is StandardProfileFileFormat.parquet:
            from .standard_profile_writer.parquet_standard_profile_writer import (
                ParquetStandardProfileWriter,
            )

            return ParquetStandardProfileWriter
        else:
            # Bug fix: the exception was constructed but never raised, and the
            # message was copy-pasted from the tidy observation table writer.
            raise ValueError(
                f"The given file format {file_format.name} is not a supported "
                f"standard profile writer format."
            )

    @classmethod
    def table_reader(cls, file_format: TableReaderFileFormat) -> Type[TableReader]:
        """
        Return a table reader of the correct type.

        Args:
            file_format: A supported table reader file format.

        Returns:
            The table reader class matching the given format.

        Raises:
            ValueError: If the given file format is not supported.

        """
        if file_format is TableReaderFileFormat.TSV:
            from .table_reader.tsv_table_reader import TSVTableReader

            return TSVTableReader
        elif file_format is TableReaderFileFormat.CSV:
            from .table_reader.csv_table_reader import CSVTableReader

            return CSVTableReader
        elif file_format is TableReaderFileFormat.XLSX:
            from .table_reader.xlsx_table_reader import XLSXTableReader

            return XLSXTableReader
        elif file_format is TableReaderFileFormat.ODS:
            from .table_reader.ods_table_reader import ODSTableReader

            return ODSTableReader
        elif file_format is TableReaderFileFormat.arrow:
            from .table_reader.arrow_table_reader import ArrowTableReader

            return ArrowTableReader
        elif file_format is TableReaderFileFormat.parquet:
            from .table_reader.parquet_table_reader import ParquetTableReader

            return ParquetTableReader
        else:
            # Bug fix: the exception was constructed but never raised.
            raise ValueError(
                f"The given file format {file_format.name} is not a supported table "
                f"reader format."
            )

    @classmethod
    def tidy_observation_table_writer(
        cls, file_format: TidyObservationTableFileFormat
    ) -> Type[TidyObservationTableWriter]:
        """
        Return a tidy table writer of the correct type.

        Args:
            file_format: A supported tidy observation table file format.

        Returns:
            The tidy observation table writer class matching the given format.

        Raises:
            ValueError: If the given file format is not supported.

        """
        if file_format is TidyObservationTableFileFormat.TSV:
            from .tidy_observation_table_writer.tsv_table_writer import (
                TSVTidyObservationTableWriter,
            )

            return TSVTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.CSV:
            from .tidy_observation_table_writer.csv_table_writer import (
                CSVTidyObservationTableWriter,
            )

            return CSVTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.XLSX:
            from .tidy_observation_table_writer.xlsx_table_writer import (
                XLSXTidyObservationTableWriter,
            )

            return XLSXTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.ODS:
            from .tidy_observation_table_writer.ods_table_writer import (
                ODSTidyObservationTableWriter,
            )

            return ODSTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.arrow:
            from .tidy_observation_table_writer.arrow_table_writer import (
                ArrowTidyObservationTableWriter,
            )

            return ArrowTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.parquet:
            from .tidy_observation_table_writer.parquet_table_writer import (
                ParquetTidyObservationTableWriter,
            )

            return ParquetTidyObservationTableWriter
        else:
            # Bug fix: the exception was constructed but never raised.
            raise ValueError(
                f"The given file format {file_format.name} is not a supported tidy "
                f"observation table writer format."
            )

    @classmethod
    def wide_observation_table_writer(
        cls, file_format: WideObservationTableFileFormat
    ) -> Type[WideObservationTableWriter]:
        """
        Return a writer for wide observation tables in the specified format.

        Args:
            file_format: A supported wide observation table file format.

        Returns:
            The wide observation table writer class matching the given format.

        Raises:
            ValueError: If the given file format is not supported.

        """
        if file_format is WideObservationTableFileFormat.TSV:
            from .wide_observation_table_writer.tsv_wide_observation_table_writer import (
                TSVWideObservationTableWriter,
            )

            return TSVWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.CSV:
            from .wide_observation_table_writer.csv_wide_observation_table_writer import (
                CSVWideObservationTableWriter,
            )

            return CSVWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.XLSX:
            from .wide_observation_table_writer.xlsx_wide_observation_table_writer import (
                XLSXWideObservationTableWriter,
            )

            return XLSXWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.ODS:
            from .wide_observation_table_writer.ods_wide_observation_table_writer import (
                ODSWideObservationTableWriter,
            )

            return ODSWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.arrow:
            from .wide_observation_table_writer.arrow_wide_observation_table_writer import (
                ArrowWideObservationTableWriter,
            )

            return ArrowWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.parquet:
            from .wide_observation_table_writer.parquet_wide_observation_table_writer import (
                ParquetWideObservationTableWriter,
            )

            return ParquetWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.BIOM:
            from .wide_observation_table_writer.biom_wide_observation_table_writer import (
                BIOMWideObservationTableWriter,
            )

            return BIOMWideObservationTableWriter
        else:
            # Bug fix: the exception was constructed but never raised.
            raise ValueError(
                f"The given file format {file_format.name} is not a supported "
                f"observation matrix writer format."
            )
Functions
profile_reader(profiler: SupportedProfiler) -> Type[ProfileReader] classmethod

Return a profile reader of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
    """
    Return a profile reader of the correct type.

    Args:
        profiler: A supported taxonomic profiler.

    Returns:
        The profile reader class matching the given profiler.

    Raises:
        ValueError: If the given profiler is not supported.

    """
    if profiler is SupportedProfiler.bracken:
        from .bracken import BrackenProfileReader

        return BrackenProfileReader
    elif profiler is SupportedProfiler.centrifuge:
        from .centrifuge import CentrifugeProfileReader

        return CentrifugeProfileReader
    elif profiler is SupportedProfiler.diamond:
        from .diamond import DiamondProfileReader

        return DiamondProfileReader
    elif profiler is SupportedProfiler.ganon:
        from .ganon import GanonProfileReader

        return GanonProfileReader
    elif profiler is SupportedProfiler.kaiju:
        from .kaiju import KaijuProfileReader

        return KaijuProfileReader
    elif profiler is SupportedProfiler.kmcp:
        from .kmcp import KMCPProfileReader

        return KMCPProfileReader
    elif profiler is SupportedProfiler.kraken2:
        from .kraken2 import Kraken2ProfileReader

        return Kraken2ProfileReader
    elif profiler is SupportedProfiler.krakenuniq:
        from .krakenuniq import KrakenUniqProfileReader

        return KrakenUniqProfileReader
    elif profiler is SupportedProfiler.megan6:
        from .megan6 import Megan6ProfileReader

        return Megan6ProfileReader
    elif profiler is SupportedProfiler.metaphlan:
        from .metaphlan import MetaphlanProfileReader

        return MetaphlanProfileReader
    elif profiler is SupportedProfiler.motus:
        from .motus import MotusProfileReader

        return MotusProfileReader
    else:
        # Guard against enum members added without a matching reader;
        # previously an unknown profiler silently returned None.
        raise ValueError(
            f"The given profiler {profiler.name} is not supported."
        )
profile_standardisation_service(profiler: SupportedProfiler) -> Type[ProfileStandardisationService] classmethod

Return a profile standardisation service of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def profile_standardisation_service(
    cls, profiler: SupportedProfiler
) -> Type[ProfileStandardisationService]:
    """
    Return a profile standardisation service of the correct type.

    Args:
        profiler: A supported taxonomic profiler.

    Returns:
        The standardisation service class matching the given profiler.

    Raises:
        ValueError: If the given profiler is not supported.

    """
    if profiler is SupportedProfiler.bracken:
        from .bracken import BrackenProfileStandardisationService

        return BrackenProfileStandardisationService
    elif profiler is SupportedProfiler.centrifuge:
        from .centrifuge import CentrifugeProfileStandardisationService

        return CentrifugeProfileStandardisationService
    elif profiler is SupportedProfiler.diamond:
        from .diamond import DiamondProfileStandardisationService

        return DiamondProfileStandardisationService
    elif profiler is SupportedProfiler.kaiju:
        from .kaiju import KaijuProfileStandardisationService

        return KaijuProfileStandardisationService
    elif profiler is SupportedProfiler.kraken2:
        from .kraken2 import Kraken2ProfileStandardisationService

        return Kraken2ProfileStandardisationService
    elif profiler is SupportedProfiler.krakenuniq:
        from .krakenuniq import KrakenUniqProfileStandardisationService

        return KrakenUniqProfileStandardisationService
    elif profiler is SupportedProfiler.megan6:
        from .megan6 import Megan6ProfileStandardisationService

        return Megan6ProfileStandardisationService
    elif profiler is SupportedProfiler.motus:
        from .motus import MotusProfileStandardisationService

        return MotusProfileStandardisationService
    elif profiler is SupportedProfiler.metaphlan:
        from .metaphlan import MetaphlanProfileStandardisationService

        return MetaphlanProfileStandardisationService
    elif profiler is SupportedProfiler.ganon:
        from .ganon import GanonProfileStandardisationService

        return GanonProfileStandardisationService
    elif profiler is SupportedProfiler.kmcp:
        from .kmcp import KMCPProfileStandardisationService

        return KMCPProfileStandardisationService
    else:
        # Replaced the opaque ValueError("Unexpected") with an informative
        # message consistent with the registry's other accessors.
        raise ValueError(
            f"The given profiler {profiler.name} is not supported."
        )
standard_profile_writer(file_format: StandardProfileFileFormat) -> Type[StandardProfileWriter] classmethod

Return a standard profile writer of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def standard_profile_writer(
    cls, file_format: StandardProfileFileFormat
) -> Type[StandardProfileWriter]:
    """
    Return a standard profile writer of the correct type.

    Args:
        file_format: A supported standard profile file format.

    Returns:
        The standard profile writer class matching the given format.

    Raises:
        ValueError: If the given file format is not supported.

    """
    if file_format is StandardProfileFileFormat.TSV:
        from .standard_profile_writer.tsv_standard_profile_writer import (
            TSVStandardProfileWriter,
        )

        return TSVStandardProfileWriter
    elif file_format is StandardProfileFileFormat.CSV:
        from .standard_profile_writer.csv_standard_profile_writer import (
            CSVStandardProfileWriter,
        )

        return CSVStandardProfileWriter
    elif file_format is StandardProfileFileFormat.XLSX:
        from .standard_profile_writer.xlsx_standard_profile_writer import (
            XLSXStandardProfileWriter,
        )

        return XLSXStandardProfileWriter
    elif file_format is StandardProfileFileFormat.ODS:
        from .standard_profile_writer.ods_standard_profile_writer import (
            ODSStandardProfileWriter,
        )

        return ODSStandardProfileWriter
    elif file_format is StandardProfileFileFormat.arrow:
        from .standard_profile_writer.arrow_standard_profile_writer import (
            ArrowStandardProfileWriter,
        )

        return ArrowStandardProfileWriter
    elif file_format is StandardProfileFileFormat.parquet:
        from .standard_profile_writer.parquet_standard_profile_writer import (
            ParquetStandardProfileWriter,
        )

        return ParquetStandardProfileWriter
    else:
        # Bug fix: the exception was constructed but never raised, and the
        # message was copy-pasted from the tidy observation table writer.
        raise ValueError(
            f"The given file format {file_format.name} is not a supported "
            f"standard profile writer format."
        )
table_reader(file_format: TableReaderFileFormat) -> Type[TableReader] classmethod

Return a table reader of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def table_reader(cls, file_format: TableReaderFileFormat) -> Type[TableReader]:
    """
    Return a table reader of the correct type.

    Args:
        file_format: A supported table reader file format.

    Returns:
        The table reader class matching the given format.

    Raises:
        ValueError: If the given file format is not supported.

    """
    if file_format is TableReaderFileFormat.TSV:
        from .table_reader.tsv_table_reader import TSVTableReader

        return TSVTableReader
    elif file_format is TableReaderFileFormat.CSV:
        from .table_reader.csv_table_reader import CSVTableReader

        return CSVTableReader
    elif file_format is TableReaderFileFormat.XLSX:
        from .table_reader.xlsx_table_reader import XLSXTableReader

        return XLSXTableReader
    elif file_format is TableReaderFileFormat.ODS:
        from .table_reader.ods_table_reader import ODSTableReader

        return ODSTableReader
    elif file_format is TableReaderFileFormat.arrow:
        from .table_reader.arrow_table_reader import ArrowTableReader

        return ArrowTableReader
    elif file_format is TableReaderFileFormat.parquet:
        from .table_reader.parquet_table_reader import ParquetTableReader

        return ParquetTableReader
    else:
        # Bug fix: the exception was constructed but never raised.
        raise ValueError(
            f"The given file format {file_format.name} is not a supported table "
            f"reader format."
        )
tidy_observation_table_writer(file_format: TidyObservationTableFileFormat) -> Type[TidyObservationTableWriter] classmethod

Return a tidy table writer of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def tidy_observation_table_writer(
    cls, file_format: TidyObservationTableFileFormat
) -> Type[TidyObservationTableWriter]:
    """
    Return a tidy table writer of the correct type.

    Args:
        file_format: A supported tidy observation table file format.

    Returns:
        The tidy observation table writer class matching the given format.

    Raises:
        ValueError: If the given file format is not supported.

    """
    if file_format is TidyObservationTableFileFormat.TSV:
        from .tidy_observation_table_writer.tsv_table_writer import (
            TSVTidyObservationTableWriter,
        )

        return TSVTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.CSV:
        from .tidy_observation_table_writer.csv_table_writer import (
            CSVTidyObservationTableWriter,
        )

        return CSVTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.XLSX:
        from .tidy_observation_table_writer.xlsx_table_writer import (
            XLSXTidyObservationTableWriter,
        )

        return XLSXTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.ODS:
        from .tidy_observation_table_writer.ods_table_writer import (
            ODSTidyObservationTableWriter,
        )

        return ODSTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.arrow:
        from .tidy_observation_table_writer.arrow_table_writer import (
            ArrowTidyObservationTableWriter,
        )

        return ArrowTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.parquet:
        from .tidy_observation_table_writer.parquet_table_writer import (
            ParquetTidyObservationTableWriter,
        )

        return ParquetTidyObservationTableWriter
    else:
        # Bug fix: the exception was constructed but never raised.
        raise ValueError(
            f"The given file format {file_format.name} is not a supported tidy "
            f"observation table writer format."
        )
wide_observation_table_writer(file_format: WideObservationTableFileFormat) -> Type[WideObservationTableWriter] classmethod

Return a writer for wide observation tables in the specified format.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def wide_observation_table_writer(
    cls, file_format: WideObservationTableFileFormat
) -> Type[WideObservationTableWriter]:
    """
    Return a writer for wide observation tables in the specified format.

    Args:
        file_format: A supported wide observation table file format.

    Returns:
        The wide observation table writer class matching the given format.

    Raises:
        ValueError: If the given file format is not supported.

    """
    if file_format is WideObservationTableFileFormat.TSV:
        from .wide_observation_table_writer.tsv_wide_observation_table_writer import (
            TSVWideObservationTableWriter,
        )

        return TSVWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.CSV:
        from .wide_observation_table_writer.csv_wide_observation_table_writer import (
            CSVWideObservationTableWriter,
        )

        return CSVWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.XLSX:
        from .wide_observation_table_writer.xlsx_wide_observation_table_writer import (
            XLSXWideObservationTableWriter,
        )

        return XLSXWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.ODS:
        from .wide_observation_table_writer.ods_wide_observation_table_writer import (
            ODSWideObservationTableWriter,
        )

        return ODSWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.arrow:
        from .wide_observation_table_writer.arrow_wide_observation_table_writer import (
            ArrowWideObservationTableWriter,
        )

        return ArrowWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.parquet:
        from .wide_observation_table_writer.parquet_wide_observation_table_writer import (
            ParquetWideObservationTableWriter,
        )

        return ParquetWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.BIOM:
        from .wide_observation_table_writer.biom_wide_observation_table_writer import (
            BIOMWideObservationTableWriter,
        )

        return BIOMWideObservationTableWriter
    else:
        # Bug fix: the exception was constructed but never raised.
        raise ValueError(
            f"The given file format {file_format.name} is not a supported "
            f"observation matrix writer format."
        )
bracken
Classes
Modules
bracken_profile

Provide a description of the Bracken profile format.

Attributes
BRACKEN_FRACTION_TOLERANCE = 0.01 module-attribute
BRACKEN_FRACTION_TOTAL = 1.0 module-attribute
Classes
BrackenProfile

Bases: BaseDataFrameModel

Define the expected Bracken profile format.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
class BrackenProfile(BaseDataFrameModel):
    """Define the expected Bracken profile format."""

    # Columns as emitted by Bracken's tab-separated abundance report.
    name: Series[str] = pa.Field()  # Taxon name.
    taxonomy_id: Series[int] = pa.Field(ge=0)  # NCBI taxonomy identifier.
    taxonomy_lvl: Series[str] = pa.Field()  # Rank code, e.g. 'S' -- TODO confirm exact codes.
    kraken_assigned_reads: Series[int] = pa.Field(ge=0)  # Reads assigned by Kraken.
    added_reads: Series[int] = pa.Field(ge=0)  # Reads redistributed by Bracken.
    new_est_reads: Series[int] = pa.Field(ge=0)  # Bracken's re-estimated read count.
    fraction_total_reads: Series[float] = pa.Field(ge=0.0, le=1.0)  # Per-taxon fraction.

    @pa.check("fraction_total_reads", name="compositionality", raise_warning=True)
    def check_compositionality(cls, fraction_total_reads: Series[float]) -> bool:
        """Check that the fractions of reads add up to one."""
        # Bracken reports fractions with five decimals but rounding errors accumulate.
        # raise_warning=True above means a violation warns rather than fails validation.
        return fraction_total_reads.empty or bool(
            np.isclose(
                fraction_total_reads.sum(),
                BRACKEN_FRACTION_TOTAL,
                atol=BRACKEN_FRACTION_TOLERANCE,
            )
        )

    @pa.dataframe_check
    def check_added_reads_consistency(cls, profile: DataFrame) -> Series[bool]:
        """Check that Bracken added reads are consistent."""
        # Row-wise invariant: kraken_assigned_reads + added_reads == new_est_reads.
        return (
            profile[cls.kraken_assigned_reads] + profile[cls.added_reads]
            == profile[cls.new_est_reads]
        )
Attributes
added_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
fraction_total_reads: Series[float] = pa.Field(ge=0.0, le=1.0) class-attribute instance-attribute
kraken_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
new_est_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_lvl: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_added_reads_consistency(profile: DataFrame) -> Series[bool]

Check that Bracken added reads are consistent.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
@pa.dataframe_check
def check_added_reads_consistency(cls, profile: DataFrame) -> Series[bool]:
    """Check that Bracken added reads are consistent."""
    # Row-wise invariant: kraken_assigned_reads + added_reads == new_est_reads.
    # Returns a boolean series; any False row fails the dataframe-level check.
    return (
        profile[cls.kraken_assigned_reads] + profile[cls.added_reads]
        == profile[cls.new_est_reads]
    )
check_compositionality(fraction_total_reads: Series[float]) -> bool

Check that the fractions of reads add up to one.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
@pa.check("fraction_total_reads", name="compositionality", raise_warning=True)
def check_compositionality(cls, fraction_total_reads: Series[float]) -> bool:
    """Check that the fractions of reads add up to one."""
    # Bracken reports fractions with five decimals but rounding errors accumulate.
    # Empty profiles pass trivially; raise_warning=True makes violations warn
    # instead of failing validation outright.
    return fraction_total_reads.empty or bool(
        np.isclose(
            fraction_total_reads.sum(),
            BRACKEN_FRACTION_TOTAL,
            atol=BRACKEN_FRACTION_TOLERANCE,
        )
    )
bracken_profile_reader

Provide a reader for Bracken profiles.

Attributes
Classes
BrackenProfileReader

Bases: ProfileReader

Define a reader for Bracken profiles.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_reader.py
class BrackenProfileReader(ProfileReader):
    """Define a reader for Bracken profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[BrackenProfile]:
        """
        Read a Bracken taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by Bracken.

        Returns:
            A data frame representation of the Bracken profile.

        """
        # Parse the tab-separated report; leading whitespace after the
        # delimiter is tolerated and no column is used as the index.
        frame = pd.read_table(
            profile,
            sep="\t",
            index_col=False,
            skipinitialspace=True,
        )
        # Reject files whose column count does not match the schema.
        cls._check_num_columns(frame, BrackenProfile)
        return frame
Functions
read(profile: BufferOrFilepath) -> DataFrame[BrackenProfile] classmethod

Read a Bracken taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by Bracken.

required

Returns:

Type Description
DataFrame[BrackenProfile]

A data frame representation of the Bracken profile.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[BrackenProfile]:
    """
    Read a Bracken taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by Bracken.

    Returns:
        A data frame representation of the Bracken profile.

    """
    # Parse the tab-separated report without using any column as the index.
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        index_col=False,
        skipinitialspace=True,
    )
    # Reject files whose column count does not match the BrackenProfile schema.
    cls._check_num_columns(result, BrackenProfile)
    return result
Functions
bracken_profile_standardisation_service

Provide a standardisation service for Bracken profiles.

Classes
BrackenProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for Bracken profiles.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_standardisation_service.py
class BrackenProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for Bracken profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[BrackenProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given Bracken profile.

        Args:
            profile: A taxonomic profile generated by Bracken.

        Returns:
            A standardized profile.

        Raises:
            pandera.errors.SchemaErrors: If the given profile does not conform with the
                `BrackenProfile` or the transformed output does not conform with the
                `StandardProfile`.  # noqa: DAR402

        """
        # Keep only the taxonomy identifier and re-estimated read count.
        selection = [BrackenProfile.taxonomy_id, BrackenProfile.new_est_reads]
        # Map Bracken column names onto the standard profile schema.
        column_mapping = {
            BrackenProfile.taxonomy_id: StandardProfile.taxonomy_id,
            BrackenProfile.new_est_reads: StandardProfile.count,
        }
        subset = profile[selection].copy()
        return subset.rename(columns=column_mapping)
Functions
transform(profile: DataFrame[BrackenProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given Bracken profile.

Parameters:

Name Type Description Default
profile DataFrame[BrackenProfile]

A taxonomic profile generated by Bracken.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Raises:

Type Description
SchemaErrors

If the given profile does not conform with the BrackenProfile or the transformed output does not conform with the StandardProfile.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[BrackenProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given Bracken profile.

    Args:
        profile: A taxonomic profile generated by Bracken.

    Returns:
        A standardized profile.

    Raises:
        pandera.errors.SchemaErrors: If the given profile does not conform with the
            `BrackenProfile` or the transformed output does not conform with the
            `StandardProfile`.  # noqa: DAR402

    """
    # Select the two relevant columns, copy to avoid mutating the caller's
    # frame, and rename them onto the StandardProfile schema.
    return (
        profile[[BrackenProfile.taxonomy_id, BrackenProfile.new_est_reads]]
        .copy()
        .rename(
            columns={
                BrackenProfile.taxonomy_id: StandardProfile.taxonomy_id,
                BrackenProfile.new_est_reads: StandardProfile.count,
            }
        )
    )
centrifuge
Classes
Modules
centrifuge_profile

Provide a description of the centrifuge profile format.

Attributes
CENTRIFUGE_PERCENT_TOLERANCE = 1.0 module-attribute
CENTRIFUGE_PERCENT_TOTAL = 100.0 module-attribute
Classes
CentrifugeProfile

Bases: BaseDataFrameModel

Define the expected centrifuge profile format.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile.py
class CentrifugeProfile(BaseDataFrameModel):
    """Define the expected centrifuge profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    clade_assigned_reads: Series[int] = pa.Field(ge=0)
    direct_assigned_reads: Series[int] = pa.Field(ge=0)
    taxonomy_level: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    name: Series[str] = pa.Field()

    @pa.check("percent", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percent: Series[float]) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        return percent.empty or bool(
            np.isclose(
                percent[:2].sum(),
                CENTRIFUGE_PERCENT_TOTAL,
                atol=CENTRIFUGE_PERCENT_TOLERANCE,
            )
        )
Attributes
clade_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
direct_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_level: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(percent: Series[float]) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile.py
@pa.check("percent", name="compositionality", raise_warning=True)
def check_compositionality(cls, percent: Series[float]) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    return percent.empty or bool(
        np.isclose(
            percent[:2].sum(),
            CENTRIFUGE_PERCENT_TOTAL,
            atol=CENTRIFUGE_PERCENT_TOLERANCE,
        )
    )
centrifuge_profile_reader

Provide a reader for Centrifuge profiles.

Attributes

Classes
CentrifugeProfileReader

Bases: ProfileReader

Define a reader for centrifuge profiles.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_reader.py
class CentrifugeProfileReader(ProfileReader):
    """Define a reader for centrifuge profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile]:
        """
        Read a centrifuge taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by centrifuge.

        Returns:
            A data frame representation of the centrifuge profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, CentrifugeProfile)
        result.columns = [
            CentrifugeProfile.percent,
            CentrifugeProfile.clade_assigned_reads,
            CentrifugeProfile.direct_assigned_reads,
            CentrifugeProfile.taxonomy_level,
            CentrifugeProfile.taxonomy_id,
            CentrifugeProfile.name,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile] classmethod

Read a centrifuge taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by centrifuge.

required

Returns:

Type Description
DataFrame[CentrifugeProfile]

A data frame representation of the centrifuge profile.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile]:
    """
    Read a centrifuge taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by centrifuge.

    Returns:
        A data frame representation of the centrifuge profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, CentrifugeProfile)
    result.columns = [
        CentrifugeProfile.percent,
        CentrifugeProfile.clade_assigned_reads,
        CentrifugeProfile.direct_assigned_reads,
        CentrifugeProfile.taxonomy_level,
        CentrifugeProfile.taxonomy_id,
        CentrifugeProfile.name,
    ]
    return result
Functions
centrifuge_profile_standardisation_service

Provide a standardisation service for centrifuge profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
CentrifugeProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for centrifuge profiles.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_standardisation_service.py
class CentrifugeProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for centrifuge profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[CentrifugeProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given centrifuge profile.

        Args:
            profile: A taxonomic profile generated by centrifuge.

        Returns:
            A standardized profile.

        """
        return (
            profile[
                [CentrifugeProfile.taxonomy_id, CentrifugeProfile.direct_assigned_reads]
            ]
            .copy()
            .rename(
                columns={
                    CentrifugeProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    CentrifugeProfile.direct_assigned_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[CentrifugeProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given centrifuge profile.

Parameters:

Name Type Description Default
profile DataFrame[CentrifugeProfile]

A taxonomic profile generated by centrifuge.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[CentrifugeProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given centrifuge profile.

    Args:
        profile: A taxonomic profile generated by centrifuge.

    Returns:
        A standardized profile.

    """
    return (
        profile[
            [CentrifugeProfile.taxonomy_id, CentrifugeProfile.direct_assigned_reads]
        ]
        .copy()
        .rename(
            columns={
                CentrifugeProfile.taxonomy_id: StandardProfile.taxonomy_id,
                CentrifugeProfile.direct_assigned_reads: StandardProfile.count,
            }
        )
    )
diamond
Classes
Modules
diamond_profile

Provide a description of the diamond profile format.

Classes
DiamondProfile

Bases: BaseDataFrameModel

Define the expected diamond profile format.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile.py
class DiamondProfile(BaseDataFrameModel):
    """Define the expected diamond profile format."""

    query_id: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    e_value: Series[float] = pa.Field(ge=0.0, le=1.0)
Attributes
e_value: Series[float] = pa.Field(ge=0.0, le=1.0) class-attribute instance-attribute
query_id: Series[str] = pa.Field() class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
diamond_profile_reader

Provide a reader for diamond profiles.

Attributes

Classes
DiamondProfileReader

Bases: ProfileReader

Define a reader for Diamond profiles.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_reader.py
class DiamondProfileReader(ProfileReader):
    """Define a reader for Diamond profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[DiamondProfile]:
        """Read a diamond taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            dtype={2: float},
        )
        cls._check_num_columns(result, DiamondProfile)
        result.columns = [
            DiamondProfile.query_id,
            DiamondProfile.taxonomy_id,
            DiamondProfile.e_value,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[DiamondProfile] classmethod

Read a diamond taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[DiamondProfile]:
    """Read a diamond taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        dtype={2: float},
    )
    cls._check_num_columns(result, DiamondProfile)
    result.columns = [
        DiamondProfile.query_id,
        DiamondProfile.taxonomy_id,
        DiamondProfile.e_value,
    ]
    return result
Functions
diamond_profile_standardisation_service

Provide a standardisation service for diamond profiles.

Classes
DiamondProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for diamond profiles.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_standardisation_service.py
class DiamondProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for diamond profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[DiamondProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given diamond profile.

        Args:
            profile: A taxonomic profile generated by diamond.

        Returns:
            A standardized profile.

        """
        # Sum up occurrences of taxonomy identifiers to yield read count.
        return (
            profile[[DiamondProfile.taxonomy_id]]
            .groupby(DiamondProfile.taxonomy_id, sort=False)
            .size()
            .reset_index()
            .rename(
                columns={
                    DiamondProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    0: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[DiamondProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given diamond profile.

Parameters:

Name Type Description Default
profile DataFrame[DiamondProfile]

A taxonomic profile generated by diamond.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[DiamondProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given diamond profile.

    Args:
        profile: A taxonomic profile generated by diamond.

    Returns:
        A standardized profile.

    """
    # Sum up occurrences of taxonomy identifiers to yield read count.
    return (
        profile[[DiamondProfile.taxonomy_id]]
        .groupby(DiamondProfile.taxonomy_id, sort=False)
        .size()
        .reset_index()
        .rename(
            columns={
                DiamondProfile.taxonomy_id: StandardProfile.taxonomy_id,
                0: StandardProfile.count,
            }
        )
    )
ganon
Classes
Modules
ganon_profile

Provide a description of the ganon profile format.

Attributes
GANON_PERCENT_TOLERANCE = 1.0 module-attribute
GANON_PERCENT_TOTAL = 100.0 module-attribute
Classes
GanonProfile

Bases: BaseDataFrameModel

Define the expected ganon profile format.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile.py
class GanonProfile(BaseDataFrameModel):
    """Define the expected ganon profile format."""

    rank: Series[str] = pa.Field()
    target: Series[str] = pa.Field()
    lineage: Series[str] = pa.Field()
    name: Series[str] = pa.Field()
    number_unique: Series[int] = pa.Field(ge=0)
    number_shared: Series[int] = pa.Field(ge=0)
    number_children: Series[int] = pa.Field(ge=0)
    number_cumulative: Series[int] = pa.Field(ge=0)
    percent_cumulative: Series[float] = pa.Field(ge=0.0, le=100.0)

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        # Ganon reports percentage to 5 decimal places, but rounding errors do add up.
        return profile.empty or bool(
            np.isclose(
                profile.loc[
                    profile[cls.rank].isin(["unclassified", "root"]),
                    cls.percent_cumulative,
                ].sum(),
                GANON_PERCENT_TOTAL,
                atol=GANON_PERCENT_TOLERANCE,
            )
        )
Attributes
lineage: Series[str] = pa.Field() class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
number_children: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_cumulative: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_shared: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_unique: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
percent_cumulative: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
rank: Series[str] = pa.Field() class-attribute instance-attribute
target: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    # Ganon reports percentage to 5 decimal places, but rounding errors do add up.
    return profile.empty or bool(
        np.isclose(
            profile.loc[
                profile[cls.rank].isin(["unclassified", "root"]),
                cls.percent_cumulative,
            ].sum(),
            GANON_PERCENT_TOTAL,
            atol=GANON_PERCENT_TOLERANCE,
        )
    )
ganon_profile_reader

Provide a reader for ganon profiles.

Attributes

Classes
GanonProfileReader

Bases: ProfileReader

Define a reader for ganon profiles.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_reader.py
class GanonProfileReader(ProfileReader):
    """Define a reader for ganon profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[GanonProfile]:
        """
        Read a ganon taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by ganon.

        Returns:
            A data frame representation of the ganon profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, GanonProfile)
        result.columns = [
            GanonProfile.rank,
            GanonProfile.target,
            GanonProfile.lineage,
            GanonProfile.name,
            GanonProfile.number_unique,
            GanonProfile.number_shared,
            GanonProfile.number_children,
            GanonProfile.number_cumulative,
            GanonProfile.percent_cumulative,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[GanonProfile] classmethod

Read a ganon taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by ganon.

required

Returns:

Type Description
DataFrame[GanonProfile]

A data frame representation of the ganon profile.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[GanonProfile]:
    """
    Read a ganon taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by ganon.

    Returns:
        A data frame representation of the ganon profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, GanonProfile)
    result.columns = [
        GanonProfile.rank,
        GanonProfile.target,
        GanonProfile.lineage,
        GanonProfile.name,
        GanonProfile.number_unique,
        GanonProfile.number_shared,
        GanonProfile.number_children,
        GanonProfile.number_cumulative,
        GanonProfile.percent_cumulative,
    ]
    return result
Functions
ganon_profile_standardisation_service

Provide a standardisation service for ganon profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
GanonProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for ganon profiles.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_standardisation_service.py
class GanonProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for ganon profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given ganon profile.

        Args:
            profile: A taxonomic profile generated by ganon.

        Returns:
            A standardized profile.

        """
        # Select unclassified entries, rename columns, assign taxonomy ID zero, and
        #  sum up counts.
        unclassified = (
            profile.loc[
                profile[GanonProfile.target] == "-",
                [GanonProfile.target, GanonProfile.number_unique],
            ]
            .copy()
            .rename(
                columns={
                    GanonProfile.target: StandardProfile.taxonomy_id,
                    GanonProfile.number_unique: StandardProfile.count,
                }
            )
            .assign(**{StandardProfile.taxonomy_id: 0})
            .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
            .sum()
        )
        # Select classified entries, rename columns, and convert taxonomy ID to integer.
        classified = (
            profile.loc[
                profile[GanonProfile.target] != "-",
                [GanonProfile.target, GanonProfile.number_unique],
            ]
            .copy()
            .rename(
                columns={
                    GanonProfile.target: StandardProfile.taxonomy_id,
                    GanonProfile.number_unique: StandardProfile.count,
                }
            )
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ].astype(int)
                }
            )
        )
        return pd.concat([unclassified, classified], ignore_index=True)
Functions
transform(profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given ganon profile.

Parameters:

Name Type Description Default
profile DataFrame[GanonProfile]

A taxonomic profile generated by ganon.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given ganon profile.

    Args:
        profile: A taxonomic profile generated by ganon.

    Returns:
        A standardized profile.

    """
    # Select unclassified entries, rename columns, assign taxonomy ID zero, and
    #  sum up counts.
    unclassified = (
        profile.loc[
            profile[GanonProfile.target] == "-",
            [GanonProfile.target, GanonProfile.number_unique],
        ]
        .copy()
        .rename(
            columns={
                GanonProfile.target: StandardProfile.taxonomy_id,
                GanonProfile.number_unique: StandardProfile.count,
            }
        )
        .assign(**{StandardProfile.taxonomy_id: 0})
        .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
        .sum()
    )
    # Select classified entries, rename columns, and convert taxonomy ID to integer.
    classified = (
        profile.loc[
            profile[GanonProfile.target] != "-",
            [GanonProfile.target, GanonProfile.number_unique],
        ]
        .copy()
        .rename(
            columns={
                GanonProfile.target: StandardProfile.taxonomy_id,
                GanonProfile.number_unique: StandardProfile.count,
            }
        )
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ].astype(int)
            }
        )
    )
    return pd.concat([unclassified, classified], ignore_index=True)
kaiju
Classes
Modules
kaiju_profile

Provide a description of the kaiju profile format.

Attributes
KAIJU_PERCENT_TOLERANCE = 1.0 module-attribute
KAIJU_PERCENT_TOTAL = 100.0 module-attribute
Classes
KaijuProfile

Bases: BaseDataFrameModel

Define the expected kaiju profile format.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
class KaijuProfile(BaseDataFrameModel):
    """Define the expected kaiju profile format."""

    file: Series[str] = pa.Field()
    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    reads: Series[int] = pa.Field(ge=0)
    taxon_id: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    taxon_name: Series[str] = pa.Field()

    @pa.check("percent", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percent: Series[float]) -> bool:
        """Check that the percentages add up to a hundred."""
        # Kaiju reports percentages with sixth decimals
        return percent.empty or bool(
            np.isclose(percent.sum(), KAIJU_PERCENT_TOTAL, atol=KAIJU_PERCENT_TOLERANCE)
        )

    @pa.check("file", name="unique_filename")
    def check_unique_filename(cls, file_col: Series[str]) -> bool:
        """Check that Kaiju filename is unique."""
        return file_col.empty or file_col.nunique() == 1
Attributes
file: Series[str] = pa.Field() class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxon_id: Series[pd.Int64Dtype] = pa.Field(nullable=True) class-attribute instance-attribute
taxon_name: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(percent: Series[float]) -> bool

Check that the percentages add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
@pa.check("percent", name="compositionality", raise_warning=True)
def check_compositionality(cls, percent: Series[float]) -> bool:
    """Check that the percentages add up to a hundred."""
    # Kaiju reports percentages with sixth decimals
    return percent.empty or bool(
        np.isclose(percent.sum(), KAIJU_PERCENT_TOTAL, atol=KAIJU_PERCENT_TOLERANCE)
    )
check_unique_filename(file_col: Series[str]) -> bool

Check that Kaiju filename is unique.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
@pa.check("file", name="unique_filename")
def check_unique_filename(cls, file_col: Series[str]) -> bool:
    """Check that Kaiju filename is unique."""
    return file_col.empty or file_col.nunique() == 1
kaiju_profile_reader

Provide a reader for kaiju profiles.

Attributes

Classes
KaijuProfileReader

Bases: ProfileReader

Define a reader for kaiju profiles.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_reader.py
class KaijuProfileReader(ProfileReader):
    """Define a reader for kaiju profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KaijuProfile]:
        """
        Read a kaiju taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by kaiju.

        Returns:
            A data frame representation of the kaiju profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=0,
            index_col=False,
            dtype={KaijuProfile.taxon_id: "Int64"},
        )
        cls._check_num_columns(result, KaijuProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KaijuProfile] classmethod

Read a kaiju taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by kaiju.

required

Returns:

Type Description
DataFrame[KaijuProfile]

A data frame representation of the kaiju profile.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KaijuProfile]:
    """
    Read a kaiju taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by kaiju.

    Returns:
        A data frame representation of the kaiju profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=0,
        index_col=False,
        dtype={KaijuProfile.taxon_id: "Int64"},
    )
    cls._check_num_columns(result, KaijuProfile)
    return result
Functions
kaiju_profile_standardisation_service

Provide a standardisation service for kaiju profiles.

Classes
KaijuProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for kaiju profiles.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_standardisation_service.py
class KaijuProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for kaiju profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given kaiju profile.

        Args:
            profile: A taxonomic profile generated by kaiju.

        Returns:
            A standardized profile.

        """
        temp = (
            profile[[KaijuProfile.taxon_id, KaijuProfile.reads]]
            .copy()
            .rename(
                columns={
                    KaijuProfile.taxon_id: StandardProfile.taxonomy_id,
                    KaijuProfile.reads: StandardProfile.count,
                }
            )
        )
        result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
        result[StandardProfile.taxonomy_id] = result[
            StandardProfile.taxonomy_id
        ].astype(int)
        # Replace missing values (unclassified reads) with ID zero and sum reads.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given kaiju profile.

Parameters:

Name Type Description Default
profile DataFrame[KaijuProfile]

A taxonomic profile generated by kaiju.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given kaiju profile.

    Args:
        profile: A taxonomic profile generated by kaiju.

    Returns:
        A standardized profile.

    """
    temp = (
        profile[[KaijuProfile.taxon_id, KaijuProfile.reads]]
        .copy()
        .rename(
            columns={
                KaijuProfile.taxon_id: StandardProfile.taxonomy_id,
                KaijuProfile.reads: StandardProfile.count,
            }
        )
    )
    result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
    result[StandardProfile.taxonomy_id] = result[
        StandardProfile.taxonomy_id
    ].astype(int)
    # Replace missing values (unclassified reads) with ID zero and sum reads.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
kmcp
Classes
Modules
kmcp_profile

Provide a description of the KMCP profile format.

Attributes
KMCP_PERCENT_TOLERANCE = 1.0 module-attribute
KMCP_PERCENT_TOTAL = 100.0 module-attribute
Classes
KMCPProfile

Bases: BaseDataFrameModel

Define the expected KMCP profile format.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
class KMCPProfile(BaseDataFrameModel):
    """Define the expected KMCP profile format."""

    # Each field describes one column of a KMCP report; the ``alias`` values
    # mirror the raw column headers emitted by KMCP.
    reference: Series[str] = pa.Field(alias="ref")
    percentage: Series[float] = pa.Field(ge=0.0, le=100.0)
    coverage: Series[float] = pa.Field(ge=0.0, nullable=True)
    score: Series[float] = pa.Field(ge=0.0, le=100.0)
    chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias="chunksFrac")
    # Kept as a string because the raw column holds a formatted depth value.
    chunks_relative_depth: Series[str] = pa.Field(alias="chunksRelDepth")
    chunks_relative_depth_std: Series[float] = pa.Field(
        ge=0.0, nullable=True, alias="chunksRelDepthStd"
    )
    reads: Series[int] = pa.Field(ge=0)
    unique_reads: Series[int] = pa.Field(ge=0, alias="ureads")
    high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias="hicureads")
    reference_size: Series[int] = pa.Field(ge=0, alias="refsize")
    reference_name: Series[str] = pa.Field(nullable=True, alias="refname")
    taxid: Series[int] = pa.Field(ge=0)
    rank: Series[str] = pa.Field(nullable=True)
    taxonomic_name: Series[str] = pa.Field(nullable=True, alias="taxname")
    taxonomic_path: Series[str] = pa.Field(nullable=True, alias="taxpath")
    taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias="taxpathsn")

    # Emits a warning rather than an error when compositionality is violated.
    @pa.check("percentage", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percentage: Series[float]) -> bool:
        """Check that the percentages add up to a hundred."""
        # KMCP profile reports percentages with sixth decimals
        return percentage.empty or bool(
            np.isclose(
                percentage.sum(), KMCP_PERCENT_TOTAL, atol=KMCP_PERCENT_TOLERANCE
            )
        )
Attributes
chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias='chunksFrac') class-attribute instance-attribute
chunks_relative_depth: Series[str] = pa.Field(alias='chunksRelDepth') class-attribute instance-attribute
chunks_relative_depth_std: Series[float] = pa.Field(ge=0.0, nullable=True, alias='chunksRelDepthStd') class-attribute instance-attribute
coverage: Series[float] = pa.Field(ge=0.0, nullable=True) class-attribute instance-attribute
high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias='hicureads') class-attribute instance-attribute
percentage: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
rank: Series[str] = pa.Field(nullable=True) class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
reference: Series[str] = pa.Field(alias='ref') class-attribute instance-attribute
reference_name: Series[str] = pa.Field(nullable=True, alias='refname') class-attribute instance-attribute
reference_size: Series[int] = pa.Field(ge=0, alias='refsize') class-attribute instance-attribute
score: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxid: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomic_name: Series[str] = pa.Field(nullable=True, alias='taxname') class-attribute instance-attribute
taxonomic_path: Series[str] = pa.Field(nullable=True, alias='taxpath') class-attribute instance-attribute
taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias='taxpathsn') class-attribute instance-attribute
unique_reads: Series[int] = pa.Field(ge=0, alias='ureads') class-attribute instance-attribute
Functions
check_compositionality(percentage: Series[float]) -> bool

Check that the percentages add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
@pa.check("percentage", name="compositionality", raise_warning=True)
def check_compositionality(cls, percentage: Series[float]) -> bool:
    """Verify that all reported percentages sum to approximately one hundred."""
    # KMCP reports percentages to six decimal places, so allow a tolerance.
    if percentage.empty:
        return True
    total = percentage.sum()
    return bool(np.isclose(total, KMCP_PERCENT_TOTAL, atol=KMCP_PERCENT_TOLERANCE))
kmcp_profile_reader

Provide a reader for KMCP profiles.

Attributes

Classes
KMCPProfileReader

Bases: ProfileReader

Define a reader for KMCP profiles.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
class KMCPProfileReader(ProfileReader):
    """Define a reader for KMCP profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
        """
        Parse a KMCP taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile
                generated by KMCP.

        Returns:
            A data frame representation of the KMCP profile.

        """
        # Force the relative-depth column to string to preserve its raw format.
        data = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=0,
            index_col=False,
            dtype={KMCPProfile.chunks_relative_depth: str},
        )
        cls._check_num_columns(data, KMCPProfile)
        return data
Functions
read(profile: BufferOrFilepath) -> DataFrame[KMCPProfile] classmethod

Read a KMCP taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by KMCP.

required

Returns:

Type Description
DataFrame[KMCPProfile]

A data frame representation of the KMCP profile.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
    """
    Parse a KMCP taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile
            generated by KMCP.

    Returns:
        A data frame representation of the KMCP profile.

    """
    # Force the relative-depth column to string to preserve its raw format.
    data = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=0,
        index_col=False,
        dtype={KMCPProfile.chunks_relative_depth: str},
    )
    cls._check_num_columns(data, KMCPProfile)
    return data
Functions
kmcp_profile_standardisation_service

Provide a standardisation service for KMCP profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
KMCPProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for KMCP profiles.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
class KMCPProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for KMCP profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
        """
        Standardize a KMCP taxonomic profile.

        Args:
            profile: A taxonomic profile generated by KMCP.

        Returns:
            A standardized profile with taxonomy identifiers and read counts.

        """
        renamed = profile[[KMCPProfile.taxid, KMCPProfile.reads]].copy()
        renamed.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
        missing = renamed[StandardProfile.taxonomy_id].isna()
        # Keep only rows with a taxonomy identifier and coerce the ID to int.
        classified = renamed.loc[~missing].copy()
        classified[StandardProfile.taxonomy_id] = classified[
            StandardProfile.taxonomy_id
        ].astype(int)
        # Collect reads without a taxonomy ID (unclassified) under the
        # artificial identifier zero and append them as a single row.
        unclassified = pd.DataFrame(
            {
                StandardProfile.taxonomy_id: [0],
                StandardProfile.count: [
                    renamed.loc[missing, StandardProfile.count].sum()
                ],
            },
            dtype=int,
        )
        return pd.concat([classified, unclassified], ignore_index=True)
Functions
transform(profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given KMCP profile.

Parameters:

Name Type Description Default
profile DataFrame[KMCPProfile]

A taxonomic profile generated by KMCP.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
    """
    Standardize a KMCP taxonomic profile.

    Args:
        profile: A taxonomic profile generated by KMCP.

    Returns:
        A standardized profile with taxonomy identifiers and read counts.

    """
    renamed = profile[[KMCPProfile.taxid, KMCPProfile.reads]].copy()
    renamed.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
    missing = renamed[StandardProfile.taxonomy_id].isna()
    # Keep only rows with a taxonomy identifier and coerce the ID to integer.
    classified = renamed.loc[~missing].copy()
    classified[StandardProfile.taxonomy_id] = classified[
        StandardProfile.taxonomy_id
    ].astype(int)
    # Collect reads without a taxonomy ID (unclassified) under the artificial
    # identifier zero and append them as a single row.
    unclassified = pd.DataFrame(
        {
            StandardProfile.taxonomy_id: [0],
            StandardProfile.count: [
                renamed.loc[missing, StandardProfile.count].sum()
            ],
        },
        dtype=int,
    )
    return pd.concat([classified, unclassified], ignore_index=True)
kraken2
Classes
Modules
kraken2_profile

Provide a description of the kraken2 profile format.

Attributes
KRAKEN2_PERCENT_TOLERANCE = 1.0 module-attribute
KRAKEN2_PERCENT_TOTAL = 100.0 module-attribute
Classes
Kraken2Profile

Bases: BaseDataFrameModel

Define the expected kraken2 profile format.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile.py
class Kraken2Profile(BaseDataFrameModel):
    """Define the expected kraken2 profile format."""

    # Column schema of a kraken2 report.
    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    clade_assigned_reads: Series[int] = pa.Field(ge=0)
    direct_assigned_reads: Series[int] = pa.Field(ge=0)
    # The two minimizer columns only appear in eight-column reports.
    num_minimizers: Optional[Series[int]] = pa.Field(ge=0)
    distinct_minimizers: Optional[Series[int]] = pa.Field(ge=0)
    taxonomy_lvl: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    name: Series[str] = pa.Field()

    # Emits a warning rather than an error when compositionality is violated.
    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        # Kraken2 reports percentages only to the second decimal, so we expect
        # some deviation.
        # If 100% of reads are assigned, unclassified reads are not reported at all.
        return profile.empty or bool(
            np.isclose(
                profile.loc[
                    profile[cls.taxonomy_lvl].isin(["U", "R"]), cls.percent
                ].sum(),
                KRAKEN2_PERCENT_TOTAL,
                atol=KRAKEN2_PERCENT_TOLERANCE,
            )
        )
Attributes
clade_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
direct_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
distinct_minimizers: Optional[Series[int]] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
num_minimizers: Optional[Series[int]] = pa.Field(ge=0) class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_lvl: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Verify that 'unclassified' and 'root' percentages sum to a hundred."""
    # Kraken2 rounds percentages to two decimals, so tolerate small deviation.
    # When every read is assigned, the unclassified ("U") row is omitted.
    if profile.empty:
        return True
    top_level = profile[cls.taxonomy_lvl].isin(["U", "R"])
    total = profile.loc[top_level, cls.percent].sum()
    return bool(
        np.isclose(total, KRAKEN2_PERCENT_TOTAL, atol=KRAKEN2_PERCENT_TOLERANCE)
    )
kraken2_profile_reader

Provide a reader for kraken2 profiles.

Attributes

Classes
Kraken2ProfileReader

Bases: ProfileReader

Define a reader for kraken2 profiles.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_reader.py
class Kraken2ProfileReader(ProfileReader):
    """Define a reader for kraken2 profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[Kraken2Profile]:
        """
        Parse a kraken2 taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile
                generated by kraken2.

        Returns:
            A data frame representation of the kraken2 profile.

        Raises:
            ValueError: In case the table does not contain exactly six or eight
                columns.

        """
        data = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        num_cols = len(data.columns)
        if num_cols not in (6, 8):
            raise ValueError(
                f"Unexpected kraken2 report format. It has {len(data.columns)} "
                f"columns but only six or eight are expected."
            )
        # Eight-column reports carry two extra minimizer columns in the middle.
        names = [
            Kraken2Profile.percent,
            Kraken2Profile.clade_assigned_reads,
            Kraken2Profile.direct_assigned_reads,
        ]
        if num_cols == 8:
            names += [
                Kraken2Profile.num_minimizers,
                Kraken2Profile.distinct_minimizers,
            ]
        names += [
            Kraken2Profile.taxonomy_lvl,
            Kraken2Profile.taxonomy_id,
            Kraken2Profile.name,
        ]
        data.columns = names
        return data
Functions
read(profile: BufferOrFilepath) -> DataFrame[Kraken2Profile] classmethod

Read a kraken2 taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by kraken2.

required

Returns:

Type Description
DataFrame[Kraken2Profile]

A data frame representation of the kraken2 profile.

Raises:

Type Description
ValueError

In case the table does not contain exactly six or eight columns.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[Kraken2Profile]:
    """
    Parse a kraken2 taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile
            generated by kraken2.

    Returns:
        A data frame representation of the kraken2 profile.

    Raises:
        ValueError: In case the table does not contain exactly six or eight
            columns.

    """
    data = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    num_cols = len(data.columns)
    if num_cols not in (6, 8):
        raise ValueError(
            f"Unexpected kraken2 report format. It has {len(data.columns)} "
            f"columns but only six or eight are expected."
        )
    # Eight-column reports carry two extra minimizer columns in the middle.
    names = [
        Kraken2Profile.percent,
        Kraken2Profile.clade_assigned_reads,
        Kraken2Profile.direct_assigned_reads,
    ]
    if num_cols == 8:
        names += [
            Kraken2Profile.num_minimizers,
            Kraken2Profile.distinct_minimizers,
        ]
    names += [
        Kraken2Profile.taxonomy_lvl,
        Kraken2Profile.taxonomy_id,
        Kraken2Profile.name,
    ]
    data.columns = names
    return data
Functions
kraken2_profile_standardisation_service

Provide a standardisation service for kraken2 profiles.

Classes
Kraken2ProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for kraken2 profiles.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_standardisation_service.py
class Kraken2ProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for kraken2 profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[Kraken2Profile]
    ) -> DataFrame[StandardProfile]:
        """
        Standardize a kraken2 taxonomic profile.

        Args:
            profile: A taxonomic profile generated by kraken2.

        Returns:
            A standardized profile with taxonomy identifiers and read counts.

        """
        # Select the taxonomy ID and directly assigned read count columns and
        # map them onto the standard profile column names.
        result = profile[
            [Kraken2Profile.taxonomy_id, Kraken2Profile.direct_assigned_reads]
        ].copy()
        result.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
        return result
Functions
transform(profile: DataFrame[Kraken2Profile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given kraken2 profile.

Parameters:

Name Type Description Default
profile DataFrame[Kraken2Profile]

A taxonomic profile generated by kraken2.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[Kraken2Profile]
) -> DataFrame[StandardProfile]:
    """
    Standardize a kraken2 taxonomic profile.

    Args:
        profile: A taxonomic profile generated by kraken2.

    Returns:
        A standardized profile with taxonomy identifiers and read counts.

    """
    # Select the taxonomy ID and directly assigned read count columns and
    # map them onto the standard profile column names.
    result = profile[
        [Kraken2Profile.taxonomy_id, Kraken2Profile.direct_assigned_reads]
    ].copy()
    result.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
    return result
krakenuniq
Classes
Modules
krakenuniq_profile

Provide a description of the KrakenUniq profile format.

Classes
KrakenUniqProfile

Bases: BaseDataFrameModel

Define the expected KrakenUniq profile format.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile.py
class KrakenUniqProfile(BaseDataFrameModel):
    """Define the expected KrakenUniq profile format."""

    # Column schema of a KrakenUniq report; the ``alias`` values mirror the
    # raw column headers emitted by KrakenUniq.
    percent: Series[float] = pa.Field(ge=0.0, le=100.0, alias="%")
    reads: Series[int] = pa.Field(ge=0)
    tax_reads: Series[int] = pa.Field(ge=0, alias="taxReads")
    kmers: Series[int] = pa.Field(ge=0)
    duplicates: Series[float] = pa.Field(ge=0.0, alias="dup")
    coverage: Series[float] = pa.Field(ge=0.0, nullable=True, alias="cov")
    tax_id: Series[int] = pa.Field(alias="taxID", ge=0)
    rank: Series[str] = pa.Field()
    tax_name: Series[str] = pa.Field(alias="taxName")
Attributes
coverage: Series[float] = pa.Field(ge=0.0, nullable=True, alias='cov') class-attribute instance-attribute
duplicates: Series[float] = pa.Field(ge=0.0, alias='dup') class-attribute instance-attribute
kmers: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0, alias='%') class-attribute instance-attribute
rank: Series[str] = pa.Field() class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
tax_id: Series[int] = pa.Field(alias='taxID', ge=0) class-attribute instance-attribute
tax_name: Series[str] = pa.Field(alias='taxName') class-attribute instance-attribute
tax_reads: Series[int] = pa.Field(ge=0, alias='taxReads') class-attribute instance-attribute
krakenuniq_profile_reader

Provide a reader for KrakenUniq profiles.

Attributes

Classes
KrakenUniqProfileReader

Bases: ProfileReader

Define a reader for KrakenUniq profiles.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_reader.py
class KrakenUniqProfileReader(ProfileReader):
    """Define a reader for KrakenUniq profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile]:
        """
        Parse a KrakenUniq taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile
                generated by KrakenUniq.

        Returns:
            A data frame representation of the KrakenUniq profile.

        """
        # Skip the two comment lines that precede the header row and force the
        # fractional columns to float.
        data = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=2,
            header=0,
            index_col=False,
            skipinitialspace=True,
            dtype={
                KrakenUniqProfile.percent: float,
                KrakenUniqProfile.duplicates: float,
                KrakenUniqProfile.coverage: float,
            },
        )
        cls._check_num_columns(data, KrakenUniqProfile)
        return data
Functions
read(profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile] classmethod

Read a krakenUniq taxonomic profile from the given source.

Parameters:

Name Type Description Default
profile BufferOrFilepath

A source that contains a tab-separated taxonomic profile generated by KrakenUniq.

required

Returns:

Type Description
DataFrame[KrakenUniqProfile]

A data frame representation of the KrakenUniq profile.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile]:
    """
    Parse a KrakenUniq taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile
            generated by KrakenUniq.

    Returns:
        A data frame representation of the KrakenUniq profile.

    """
    # Skip the two comment lines that precede the header row and force the
    # fractional columns to float.
    data = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=2,
        header=0,
        index_col=False,
        skipinitialspace=True,
        dtype={
            KrakenUniqProfile.percent: float,
            KrakenUniqProfile.duplicates: float,
            KrakenUniqProfile.coverage: float,
        },
    )
    cls._check_num_columns(data, KrakenUniqProfile)
    return data
Functions
krakenuniq_profile_standardisation_service

Provide a standardisation service for KrakenUniq profiles.

Classes
KrakenUniqProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for krakenUniq profiles.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_standardisation_service.py
class KrakenUniqProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for krakenUniq profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[KrakenUniqProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Standardize a KrakenUniq taxonomic profile.

        Args:
            profile: A taxonomic profile generated by KrakenUniq.

        Returns:
            A standardized profile with taxonomy identifiers and read counts.

        """
        # Select the taxonomy ID and taxon-level read count columns and map
        # them onto the standard profile column names.
        result = profile[
            [KrakenUniqProfile.tax_id, KrakenUniqProfile.tax_reads]
        ].copy()
        result.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
        return result
Functions
transform(profile: DataFrame[KrakenUniqProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given krakenUniq profile.

Parameters:

Name Type Description Default
profile DataFrame[KrakenUniqProfile]

A taxonomic profile generated by KrakenUniq.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[KrakenUniqProfile]
) -> DataFrame[StandardProfile]:
    """
    Standardize a KrakenUniq taxonomic profile.

    Args:
        profile: A taxonomic profile generated by KrakenUniq.

    Returns:
        A standardized profile with taxonomy identifiers and read counts.

    """
    # Select the taxonomy ID and taxon-level read count columns and map them
    # onto the standard profile column names.
    result = profile[
        [KrakenUniqProfile.tax_id, KrakenUniqProfile.tax_reads]
    ].copy()
    result.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
    return result
megan6
Classes
Modules
megan6_profile

Provide a description of the MEGAN6 rma2info profile format.

Classes
Megan6Profile

Bases: BaseDataFrameModel

Define the expected MEGAN6 rma2info profile format.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile.py
class Megan6Profile(BaseDataFrameModel):
    """Define the expected MEGAN6 rma2info profile format."""

    # An rma2info export has only two columns: the taxonomy ID and a count.
    taxonomy_id: Series[int] = pa.Field(ge=0)
    count: Series[float] = pa.Field(ge=0.0)
Attributes
count: Series[float] = pa.Field(ge=0.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
megan6_profile_reader

Provide a reader for megan6 profiles.

Attributes

Classes
Megan6ProfileReader

Bases: ProfileReader

Define a reader for MEGAN6 rma2info profiles.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_reader.py
class Megan6ProfileReader(ProfileReader):
    """Define a reader for MEGAN6 rma2info profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[Megan6Profile]:
        """Parse a MEGAN6 rma2info taxonomic profile from the given source."""
        # The export has no header row, so assign the column names explicitly.
        data = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            index_col=False,
            header=None,
        )
        cls._check_num_columns(data, Megan6Profile)
        data.columns = [Megan6Profile.taxonomy_id, Megan6Profile.count]
        return data
Functions
read(profile: BufferOrFilepath) -> DataFrame[Megan6Profile] classmethod

Read a MEGAN6 rma2info taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[Megan6Profile]:
    """Parse a MEGAN6 rma2info taxonomic profile from the given source."""
    # The export has no header row, so assign the column names explicitly.
    data = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        index_col=False,
        header=None,
    )
    cls._check_num_columns(data, Megan6Profile)
    data.columns = [Megan6Profile.taxonomy_id, Megan6Profile.count]
    return data
Functions
megan6_profile_standardisation_service

Provide a standardisation service for megan6 profiles.

Classes
Megan6ProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for megan6 profiles.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_standardisation_service.py
class Megan6ProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for megan6 profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile]:
        """
        Standardize a MEGAN6 rma2info taxonomic profile.

        Args:
            profile: A taxonomic profile generated by MEGAN6 rma2info.

        Returns:
            A standardized profile with taxonomy identifiers and counts.

        """
        # Select the two relevant columns and map them onto the standard
        # profile column names.
        result = profile[[Megan6Profile.taxonomy_id, Megan6Profile.count]].copy()
        result.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
        return result
Functions
transform(profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given MEGAN6 rma2info profile.

Parameters:

Name Type Description Default
profile DataFrame[Megan6Profile]

A taxonomic profile generated by MEGAN6 rma2info.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile]:
    """
    Standardize a MEGAN6 rma2info taxonomic profile.

    Args:
        profile: A taxonomic profile generated by MEGAN6 rma2info.

    Returns:
        A standardized profile with taxonomy identifiers and counts.

    """
    # Select the two relevant columns and map them onto the standard profile
    # column names.
    result = profile[[Megan6Profile.taxonomy_id, Megan6Profile.count]].copy()
    result.columns = [StandardProfile.taxonomy_id, StandardProfile.count]
    return result
metaphlan
Classes
Modules
metaphlan_profile

Provide a description of the metaphlan profile format.

Attributes
METAPHLAN_PERCENT_TOLERANCE = 1.0 module-attribute
METAPHLAN_PERCENT_TOTAL = 100.0 module-attribute
Classes
MetaphlanProfile

Bases: BaseDataFrameModel

Define the expected metaphlan profile format.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile.py
class MetaphlanProfile(BaseDataFrameModel):
    """Define the expected metaphlan profile format."""

    clade_name: Series[str] = pa.Field()
    # MetaPhlan provides the full lineage of tax IDs in this field.
    ncbi_tax_id: Series[str] = pa.Field(alias="NCBI_tax_id")
    # Relative abundance is a percentage and must lie within [0, 100].
    relative_abundance: Series[float] = pa.Field(ge=0.0, le=100.0)
    # Optional and nullable; presumably absent in some MetaPhlAn
    # versions/outputs — confirm against the reader.
    additional_species: Optional[Series[str]] = pa.Field(nullable=True)

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percentages per rank add up to a hundred."""
        # Parse the rank from the given lineage.
        # The first character of the last lineage element encodes the rank.
        rank = profile[cls.clade_name].str.rsplit("|", n=1).str[-1].str[0]
        # An empty profile passes trivially; otherwise each rank's abundances
        # must sum to the expected total within the configured tolerance.
        # Failures only emit a warning (raise_warning=True above).
        return profile.empty or bool(
            np.allclose(
                profile.groupby(rank, sort=False)[cls.relative_abundance].sum(),
                METAPHLAN_PERCENT_TOTAL,
                atol=METAPHLAN_PERCENT_TOLERANCE,
            )
        )
Attributes
additional_species: Optional[Series[str]] = pa.Field(nullable=True) class-attribute instance-attribute
clade_name: Series[str] = pa.Field() class-attribute instance-attribute
ncbi_tax_id: Series[str] = pa.Field(alias='NCBI_tax_id') class-attribute instance-attribute
relative_abundance: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percentages per rank add up to a hundred.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percentages per rank add up to a hundred."""
    # Parse the rank from the given lineage.
    # The first character of the last lineage element encodes the rank.
    rank = profile[cls.clade_name].str.rsplit("|", n=1).str[-1].str[0]
    # An empty profile passes trivially; otherwise each rank's abundances
    # must sum to the expected total within the configured tolerance.
    return profile.empty or bool(
        np.allclose(
            profile.groupby(rank, sort=False)[cls.relative_abundance].sum(),
            METAPHLAN_PERCENT_TOTAL,
            atol=METAPHLAN_PERCENT_TOLERANCE,
        )
    )
metaphlan_profile_reader

Provide a reader for metaphlan profiles.

Attributes

Classes
MetaphlanProfileReader

Bases: ProfileReader

Define a reader for Metaphlan profiles.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_reader.py
class MetaphlanProfileReader(ProfileReader):
    """Define a reader for Metaphlan profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile]:
        """
        Read a metaphlan taxonomic profile from a file.

        Args:
            profile: A buffer or path to a MetaPhlAn profile.

        Returns:
            A data frame with the expected MetaPhlAn profile columns.

        """
        # The number of header comment lines differs between MetaPhlAn
        # versions, so it is detected rather than hard-coded.
        num_header_lines = cls._detect_number_header_line(profile)
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=num_header_lines,
            header=None,
            index_col=False,
            # Column 1 holds the pipe-separated tax ID lineage; read it as a
            # string so numeric parsing does not mangle it.
            dtype={1: str},
        )
        cls._check_num_columns(result, MetaphlanProfile)
        result.columns = [
            MetaphlanProfile.clade_name,
            MetaphlanProfile.ncbi_tax_id,
            MetaphlanProfile.relative_abundance,
            MetaphlanProfile.additional_species,
        ]
        return result

    @classmethod
    def _detect_number_header_line(cls, profile: BufferOrFilepath) -> int:
        """
        Detect the number of comment lines in the header of a MetaPhlAn profile.

        The number of lines varies at least between versions 3 & 4.

        """
        # NOTE(review): isinstance checks against typing.BinaryIO/TextIO do
        #  not match built-in file objects unless concrete classes of the same
        #  names are in scope here — confirm which BinaryIO/TextIO is imported.
        if isinstance(profile, BinaryIO):
            # We assume default file encoding here (UTF-8 in most environments).
            result = cls._detect_first_content_line(buffer=TextIOWrapper(profile))
            # Rewind so the subsequent full read starts from the beginning.
            profile.seek(0)
            return result
        elif isinstance(profile, TextIO):
            result = cls._detect_first_content_line(buffer=profile)
            profile.seek(0)
            return result
        else:
            with Path(profile).open(mode="r") as handle:
                return cls._detect_first_content_line(buffer=handle)

    @classmethod
    def _detect_first_content_line(
        cls, buffer: TextIO, comment_marker: str = "#", max_lines: int = 10
    ) -> int:
        """
        Detect the first non-comment line in the given text buffer.

        Args:
            buffer: A text buffer positioned at the start of the profile.
            comment_marker: The prefix marking a header/comment line.
            max_lines: The maximum number of comment lines tolerated before
                giving up.

        Returns:
            The zero-based index of the first content line.

        Raises:
            ValueError: If more than ``max_lines`` comment lines precede the
                content, or if the buffer contains no content lines at all.

        """
        for num, line in enumerate(buffer):
            if not line.startswith(comment_marker):
                return num
            if num >= max_lines:
                # Report the actual limit instead of a hard-coded number so
                # the message stays correct if a caller overrides max_lines.
                raise ValueError(
                    "Unexpectedly large number of comment lines in MetaPhlAn "
                    f"profile (>{max_lines})."
                )
        else:
            raise ValueError("Could not detect any content lines in MetaPhlAn profile.")
Functions
read(profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile] classmethod

Read a metaphlan taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile]:
    """Read a metaphlan taxonomic profile from a file."""
    # Header length differs between MetaPhlAn versions; detect it dynamically.
    skip = cls._detect_number_header_line(profile)
    table = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        skiprows=skip,
        index_col=False,
        # Keep the lineage column (index 1) as string data.
        dtype={1: str},
    )
    cls._check_num_columns(table, MetaphlanProfile)
    table.columns = [
        MetaphlanProfile.clade_name,
        MetaphlanProfile.ncbi_tax_id,
        MetaphlanProfile.relative_abundance,
        MetaphlanProfile.additional_species,
    ]
    return table
Functions
metaphlan_profile_standardisation_service

Provide a standardisation service for metaphlan profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
MetaphlanProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for metaphlan profiles.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_standardisation_service.py
class MetaphlanProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for metaphlan profiles."""

    # Metaphlan only reports up to six decimals so this number should be large enough.
    # Multiplying relative abundances (percentages) by this factor turns them
    # into integer pseudo-counts without losing reported precision.
    LARGE_INTEGER = 1_000_000

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[MetaphlanProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given metaphlan profile.

        Args:
            profile: A taxonomic profile generated by metaphlan.

        Returns:
            A standardized profile.

        """
        result = (
            # Keep only the tax ID lineage and the relative abundance.
            profile[[MetaphlanProfile.ncbi_tax_id, MetaphlanProfile.relative_abundance]]
            .copy()
            .rename(
                columns={
                    MetaphlanProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                    MetaphlanProfile.relative_abundance: StandardProfile.count,
                }
            )
            .assign(
                **{
                    # The tax ID column holds a pipe-separated lineage; keep
                    # only the last (most specific) element.
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ]
                    .str.rsplit("|", n=1)
                    .str[-1],
                    # Scale the percentages to integer-sized pseudo-counts.
                    StandardProfile.count: lambda df: df[StandardProfile.count]
                    * cls.LARGE_INTEGER,
                }
            )
            .assign(
                **{
                    # Truncate the scaled counts to plain integers.
                    StandardProfile.count: lambda df: df[StandardProfile.count].astype(
                        int
                    )
                }
            )
        )
        # Coerce tax IDs to nullable integers; non-numeric values become <NA>.
        result[StandardProfile.taxonomy_id] = pd.to_numeric(
            result[StandardProfile.taxonomy_id], errors="coerce"
        ).astype("Int64")
        # Treat missing tax IDs and IDs equal to -1 as unclassified
        # (presumably MetaPhlAn's unclassified marker — confirm).
        unclassified_mask = result[StandardProfile.taxonomy_id].isna() | (
            result[StandardProfile.taxonomy_id] == -1
        )
        num = int(unclassified_mask.sum())
        if num > 0:
            logger.warning(
                "Combining %d entries with unclassified taxa in the profile.", num
            )
        # Collapse all unclassified entries into a single row with tax ID 0.
        # NOTE(review): this row is appended even when nothing is
        #  unclassified, producing a zero-count row — confirm this is intended.
        return pd.concat(
            [
                result.loc[~unclassified_mask, :],
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            result.loc[unclassified_mask, StandardProfile.count].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Attributes
LARGE_INTEGER = 1000000 class-attribute instance-attribute
Functions
transform(profile: DataFrame[MetaphlanProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given metaphlan profile.

Parameters:

Name Type Description Default
profile DataFrame[MetaphlanProfile]

A taxonomic profile generated by metaphlan.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[MetaphlanProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given metaphlan profile.

    Args:
        profile: A taxonomic profile generated by metaphlan.

    Returns:
        A standardized profile.

    """
    result = (
        # Keep only the tax ID lineage and the relative abundance.
        profile[[MetaphlanProfile.ncbi_tax_id, MetaphlanProfile.relative_abundance]]
        .copy()
        .rename(
            columns={
                MetaphlanProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                MetaphlanProfile.relative_abundance: StandardProfile.count,
            }
        )
        .assign(
            **{
                # Keep only the last (most specific) lineage element.
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ]
                .str.rsplit("|", n=1)
                .str[-1],
                # Scale the percentages to integer-sized pseudo-counts.
                StandardProfile.count: lambda df: df[StandardProfile.count]
                * cls.LARGE_INTEGER,
            }
        )
        .assign(
            **{
                # Truncate the scaled counts to plain integers.
                StandardProfile.count: lambda df: df[StandardProfile.count].astype(
                    int
                )
            }
        )
    )
    # Coerce tax IDs to nullable integers; non-numeric values become <NA>.
    result[StandardProfile.taxonomy_id] = pd.to_numeric(
        result[StandardProfile.taxonomy_id], errors="coerce"
    ).astype("Int64")
    # Treat missing tax IDs and IDs equal to -1 as unclassified.
    unclassified_mask = result[StandardProfile.taxonomy_id].isna() | (
        result[StandardProfile.taxonomy_id] == -1
    )
    num = int(unclassified_mask.sum())
    if num > 0:
        logger.warning(
            "Combining %d entries with unclassified taxa in the profile.", num
        )
    # Collapse all unclassified entries into a single row with tax ID 0.
    return pd.concat(
        [
            result.loc[~unclassified_mask, :],
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        result.loc[unclassified_mask, StandardProfile.count].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
motus
Classes
Modules
motus_profile

Provide a description of the mOTUs profile format.

Classes
MotusProfile

Bases: BaseDataFrameModel

Define the expected mOTUs profile format.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile.py
class MotusProfile(BaseDataFrameModel):
    """Define the expected mOTUs profile format."""

    consensus_taxonomy: Series[str] = pa.Field()
    # Nullable, since some entries carry no NCBI tax ID; those are later
    # aggregated as 'unassigned' by the standardisation service.
    ncbi_tax_id: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    # Read counts must be non-negative.
    read_count: Series[int] = pa.Field(ge=0)
Attributes
consensus_taxonomy: Series[str] = pa.Field() class-attribute instance-attribute
ncbi_tax_id: Series[pd.Int64Dtype] = pa.Field(nullable=True) class-attribute instance-attribute
read_count: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
motus_profile_reader

Provide a reader for motus profiles.

Attributes

Classes
MotusProfileReader

Bases: ProfileReader

Define a reader for mOTUS profiles.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_reader.py
class MotusProfileReader(ProfileReader):
    """Define a reader for mOTUs profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[MotusProfile]:
        """Read a mOTUs taxonomic profile from a file."""
        # The first three lines are header comments; skip them outright.
        table = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            skiprows=3,
            index_col=False,
            # Keep the tax ID column as a nullable integer type.
            dtype={1: "Int64"},
        )
        cls._check_num_columns(table, MotusProfile)
        table.columns = [
            MotusProfile.consensus_taxonomy,
            MotusProfile.ncbi_tax_id,
            MotusProfile.read_count,
        ]
        return table
Functions
read(profile: BufferOrFilepath) -> DataFrame[MotusProfile] classmethod

Read a mOTUs taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[MotusProfile]:
    """Read a mOTUs taxonomic profile from a file."""
    # Skip the three header comment lines; read the tax ID column as a
    # nullable integer type.
    frame = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        skiprows=3,
        index_col=False,
        dtype={1: "Int64"},
    )
    cls._check_num_columns(frame, MotusProfile)
    frame.columns = [
        MotusProfile.consensus_taxonomy,
        MotusProfile.ncbi_tax_id,
        MotusProfile.read_count,
    ]
    return frame
Functions
motus_profile_standardisation_service

Provide a standardisation service for mOTUs profiles.

Classes
MotusProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for mOTUs profiles.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_standardisation_service.py
class MotusProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for mOTUs profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given mOTUs profile.

        Args:
            profile: A taxonomic profile generated by mOTUs.

        Returns:
            A standardized profile.

        """
        temp = (
            profile.loc[
                # Ignore entries with zero read count.
                profile[MotusProfile.read_count] > 0,
                [MotusProfile.ncbi_tax_id, MotusProfile.read_count],
            ]
            .copy()
            .rename(
                columns={
                    MotusProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                    MotusProfile.read_count: StandardProfile.count,
                }
            )
        )
        # Split profile into entries with known and unknown tax ID.
        result = (
            temp.loc[temp[StandardProfile.taxonomy_id].notna(), :]
            .copy()
            .assign(
                **{
                    # Known tax IDs can safely be cast from nullable Int64 to int.
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ].astype(int)
                }
            )
            # FIXME (Moritz): Apparently, mOTUs profiles can contain duplicate tax IDs.
            #  Clarify with Sofia and Maxime. For now, sum up read counts.
            #  https://github.com/taxprofiler/taxpasta/issues/46
            .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
            .sum()
        )
        # Sum up all remaining read counts without tax ID to be 'unassigned'.
        # NOTE(review): the tax ID 0 row is appended even when that sum is
        #  zero — confirm downstream consumers expect this.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given mOTUs profile.

Parameters:

Name Type Description Default
profile DataFrame[MotusProfile]

A taxonomic profile generated by mOTUs.

required

Returns:

Type Description
DataFrame[StandardProfile]

A standardized profile.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given mOTUs profile.

    Args:
        profile: A taxonomic profile generated by mOTUs.

    Returns:
        A standardized profile.

    """
    temp = (
        profile.loc[
            # Ignore entries with zero read count.
            profile[MotusProfile.read_count] > 0,
            [MotusProfile.ncbi_tax_id, MotusProfile.read_count],
        ]
        .copy()
        .rename(
            columns={
                MotusProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                MotusProfile.read_count: StandardProfile.count,
            }
        )
    )
    # Split profile into entries with known and unknown tax ID.
    result = (
        temp.loc[temp[StandardProfile.taxonomy_id].notna(), :]
        .copy()
        .assign(
            **{
                # Known tax IDs can safely be cast from nullable Int64 to int.
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ].astype(int)
            }
        )
        # FIXME (Moritz): Apparently, mOTUs profiles can contain duplicate tax IDs.
        #  Clarify with Sofia and Maxime. For now, sum up read counts.
        #  https://github.com/taxprofiler/taxpasta/issues/46
        .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
        .sum()
    )
    # Sum up all remaining read counts without tax ID to be 'unassigned'.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
sample_sheet

Provide a description of samples and profile locations.

Classes
SampleSheet

Bases: DataFrameModel

Define a description of samples and profile locations.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
class SampleSheet(pa.DataFrameModel):
    """Define a description of samples and profile locations."""

    # The sample name and the location of its taxonomic profile on disk.
    sample: Series[str] = pa.Field()
    profile: Series[str] = pa.Field()  # type: ignore

    @pa.dataframe_check
    @classmethod
    def check_number_samples(cls, table: DataFrame) -> bool:
        """Check that there are at least two samples."""
        # Only rows where both the sample name and the profile path are set
        # count towards the minimum of two samples.
        return (table[cls.sample].notnull() & table[cls.profile].notnull()).sum() > 1

    @pa.check("profile", name="profile_presence")
    @classmethod
    def check_profile_presence(
        cls, profile: Series[str]  # type: ignore
    ) -> Series[bool]:
        """Check that every profile is present at the specified location."""
        # Element-wise test that each given path points to an existing file.
        return cast(Series[bool], profile.map(lambda path: Path(path).is_file()))

    class Config:
        """Configure the schema model."""

        # Coerce dtypes, enforce column order, and reject undeclared columns.
        coerce = True
        ordered = True
        strict = True
Attributes
profile: Series[str] = pa.Field() class-attribute instance-attribute
sample: Series[str] = pa.Field() class-attribute instance-attribute
Classes
Config

Configure the schema model.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
class Config:
    """Configure the schema model."""

    # Coerce columns to the declared dtypes during validation.
    coerce = True
    # Enforce that columns appear in the declared order.
    ordered = True
    # Reject any columns that are not declared on the model.
    strict = True
Attributes
coerce = True class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute
Functions
check_number_samples(table: DataFrame) -> bool classmethod

Check that there are at least two samples.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
@pa.dataframe_check
@classmethod
def check_number_samples(cls, table: DataFrame) -> bool:
    """Check that there are at least two samples."""
    # A row counts only when both the sample name and profile path are set.
    complete_rows = table[cls.sample].notnull() & table[cls.profile].notnull()
    return complete_rows.sum() > 1
check_profile_presence(profile: Series[str]) -> Series[bool] classmethod

Check that every profile is present at the specified location.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
@pa.check("profile", name="profile_presence")
@classmethod
def check_profile_presence(
    cls, profile: Series[str]  # type: ignore
) -> Series[bool]:
    """Check that every profile is present at the specified location."""

    def is_existing_file(path: str) -> bool:
        # True only when the path points to a regular file on disk.
        return Path(path).is_file()

    return cast(Series[bool], profile.map(is_existing_file))
standard_profile_file_format

Provide a service for supported tabular file formats.

Classes
StandardProfileFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported standardized profile file formats.

Source code in src/taxpasta/infrastructure/application/standard_profile_file_format.py
@unique
class StandardProfileFileFormat(str,