application

Classes

Modules

application_service_registry

Provide an application service registry.

Classes
ApplicationServiceRegistry

Define an application service registry.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
class ApplicationServiceRegistry:
    """Define an application service registry."""

    @classmethod
    def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
        """Return a profile reader of the correct type."""
        if profiler is SupportedProfiler.bracken:
            from .bracken import BrackenProfileReader

            return BrackenProfileReader
        elif profiler is SupportedProfiler.centrifuge:
            from .centrifuge import CentrifugeProfileReader

            return CentrifugeProfileReader
        elif profiler is SupportedProfiler.diamond:
            from .diamond import DiamondProfileReader

            return DiamondProfileReader
        elif profiler is SupportedProfiler.ganon:
            from .ganon import GanonProfileReader

            return GanonProfileReader
        elif profiler is SupportedProfiler.kaiju:
            from .kaiju import KaijuProfileReader

            return KaijuProfileReader
        elif profiler is SupportedProfiler.kmcp:
            from .kmcp import KMCPProfileReader

            return KMCPProfileReader
        elif profiler is SupportedProfiler.kraken2:
            from .kraken2 import Kraken2ProfileReader

            return Kraken2ProfileReader
        elif profiler is SupportedProfiler.krakenuniq:
            from .krakenuniq import KrakenUniqProfileReader

            return KrakenUniqProfileReader
        elif profiler is SupportedProfiler.megan6:
            from .megan6 import Megan6ProfileReader

            return Megan6ProfileReader
        elif profiler is SupportedProfiler.metaphlan:
            from .metaphlan import MetaphlanProfileReader

            return MetaphlanProfileReader
        elif profiler is SupportedProfiler.motus:
            from .motus import MotusProfileReader

            return MotusProfileReader
        else:
            raise ValueError(f"Unexpected profiler: {profiler}.")

    @classmethod
    def profile_standardisation_service(
        cls, profiler: SupportedProfiler
    ) -> Type[ProfileStandardisationService]:
        """Return a profile standardisation service of the correct type."""
        if profiler is SupportedProfiler.bracken:
            from .bracken import BrackenProfileStandardisationService

            return BrackenProfileStandardisationService
        elif profiler is SupportedProfiler.centrifuge:
            from .centrifuge import CentrifugeProfileStandardisationService

            return CentrifugeProfileStandardisationService
        elif profiler is SupportedProfiler.diamond:
            from .diamond import DiamondProfileStandardisationService

            return DiamondProfileStandardisationService
        elif profiler is SupportedProfiler.kaiju:
            from .kaiju import KaijuProfileStandardisationService

            return KaijuProfileStandardisationService
        elif profiler is SupportedProfiler.kraken2:
            from .kraken2 import Kraken2ProfileStandardisationService

            return Kraken2ProfileStandardisationService
        elif profiler is SupportedProfiler.krakenuniq:
            from .krakenuniq import KrakenUniqProfileStandardisationService

            return KrakenUniqProfileStandardisationService
        elif profiler is SupportedProfiler.megan6:
            from .megan6 import Megan6ProfileStandardisationService

            return Megan6ProfileStandardisationService
        elif profiler is SupportedProfiler.motus:
            from .motus import MotusProfileStandardisationService

            return MotusProfileStandardisationService
        elif profiler is SupportedProfiler.metaphlan:
            from .metaphlan import MetaphlanProfileStandardisationService

            return MetaphlanProfileStandardisationService
        elif profiler is SupportedProfiler.ganon:
            from .ganon import GanonProfileStandardisationService

            return GanonProfileStandardisationService
        elif profiler is SupportedProfiler.kmcp:
            from .kmcp import KMCPProfileStandardisationService

            return KMCPProfileStandardisationService
        else:
            raise ValueError(f"Unexpected profiler: {profiler}.")

    @classmethod
    def standard_profile_writer(
        cls, file_format: StandardProfileFileFormat
    ) -> Type[StandardProfileWriter]:
        """Return a standard profile writer of the correct type."""
        if file_format is StandardProfileFileFormat.TSV:
            from .standard_profile_writer.tsv_standard_profile_writer import (
                TSVStandardProfileWriter,
            )

            return TSVStandardProfileWriter
        elif file_format is StandardProfileFileFormat.CSV:
            from .standard_profile_writer.csv_standard_profile_writer import (
                CSVStandardProfileWriter,
            )

            return CSVStandardProfileWriter
        elif file_format is StandardProfileFileFormat.XLSX:
            from .standard_profile_writer.xlsx_standard_profile_writer import (
                XLSXStandardProfileWriter,
            )

            return XLSXStandardProfileWriter
        elif file_format is StandardProfileFileFormat.ODS:
            from .standard_profile_writer.ods_standard_profile_writer import (
                ODSStandardProfileWriter,
            )

            return ODSStandardProfileWriter
        elif file_format is StandardProfileFileFormat.arrow:
            from .standard_profile_writer.arrow_standard_profile_writer import (
                ArrowStandardProfileWriter,
            )

            return ArrowStandardProfileWriter
        elif file_format is StandardProfileFileFormat.parquet:
            from .standard_profile_writer.parquet_standard_profile_writer import (
                ParquetStandardProfileWriter,
            )

            return ParquetStandardProfileWriter
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported "
                f"standard profile writer format."
            )

    @classmethod
    def table_reader(cls, file_format: TableReaderFileFormat) -> Type[TableReader]:
        """Return a table reader of the correct type."""
        if file_format is TableReaderFileFormat.TSV:
            from .table_reader.tsv_table_reader import TSVTableReader

            return TSVTableReader
        elif file_format is TableReaderFileFormat.CSV:
            from .table_reader.csv_table_reader import CSVTableReader

            return CSVTableReader
        elif file_format is TableReaderFileFormat.XLSX:
            from .table_reader.xlsx_table_reader import XLSXTableReader

            return XLSXTableReader
        elif file_format is TableReaderFileFormat.ODS:
            from .table_reader.ods_table_reader import ODSTableReader

            return ODSTableReader
        elif file_format is TableReaderFileFormat.arrow:
            from .table_reader.arrow_table_reader import ArrowTableReader

            return ArrowTableReader
        elif file_format is TableReaderFileFormat.parquet:
            from .table_reader.parquet_table_reader import ParquetTableReader

            return ParquetTableReader
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported table "
                f"reader format."
            )

    @classmethod
    def tidy_observation_table_writer(
        cls, file_format: TidyObservationTableFileFormat
    ) -> Type[TidyObservationTableWriter]:
        """Return a tidy table writer of the correct type."""
        if file_format is TidyObservationTableFileFormat.TSV:
            from .tidy_observation_table_writer.tsv_table_writer import (
                TSVTidyObservationTableWriter,
            )

            return TSVTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.CSV:
            from .tidy_observation_table_writer.csv_table_writer import (
                CSVTidyObservationTableWriter,
            )

            return CSVTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.XLSX:
            from .tidy_observation_table_writer.xlsx_table_writer import (
                XLSXTidyObservationTableWriter,
            )

            return XLSXTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.ODS:
            from .tidy_observation_table_writer.ods_table_writer import (
                ODSTidyObservationTableWriter,
            )

            return ODSTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.arrow:
            from .tidy_observation_table_writer.arrow_table_writer import (
                ArrowTidyObservationTableWriter,
            )

            return ArrowTidyObservationTableWriter
        elif file_format is TidyObservationTableFileFormat.parquet:
            from .tidy_observation_table_writer.parquet_table_writer import (
                ParquetTidyObservationTableWriter,
            )

            return ParquetTidyObservationTableWriter
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported tidy "
                f"observation table writer format."
            )

    @classmethod
    def wide_observation_table_writer(
        cls, file_format: WideObservationTableFileFormat
    ) -> Type[WideObservationTableWriter]:
        """Return a writer for wide observation tables in the specified format."""
        if file_format is WideObservationTableFileFormat.TSV:
            from .wide_observation_table_writer.tsv_wide_observation_table_writer import (
                TSVWideObservationTableWriter,
            )

            return TSVWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.CSV:
            from .wide_observation_table_writer.csv_wide_observation_table_writer import (
                CSVWideObservationTableWriter,
            )

            return CSVWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.XLSX:
            from .wide_observation_table_writer.xlsx_wide_observation_table_writer import (
                XLSXWideObservationTableWriter,
            )

            return XLSXWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.ODS:
            from .wide_observation_table_writer.ods_wide_observation_table_writer import (
                ODSWideObservationTableWriter,
            )

            return ODSWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.arrow:
            from .wide_observation_table_writer.arrow_wide_observation_table_writer import (
                ArrowWideObservationTableWriter,
            )

            return ArrowWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.parquet:
            from .wide_observation_table_writer.parquet_wide_observation_table_writer import (
                ParquetWideObservationTableWriter,
            )

            return ParquetWideObservationTableWriter
        elif file_format is WideObservationTableFileFormat.BIOM:
            from .wide_observation_table_writer.biom_wide_observation_table_writer import (
                BIOMWideObservationTableWriter,
            )

            return BIOMWideObservationTableWriter
        else:
            raise ValueError(
                f"The given file format {file_format.name} is not a supported "
                f"observation matrix writer format."
            )
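Taken together, the registry reduces the typical read-and-standardise flow to two lookups. The following sketch is illustrative only: it assumes that ApplicationServiceRegistry and SupportedProfiler can be imported from taxpasta.infrastructure.application, and it uses a hypothetical input path bracken.tsv.

from taxpasta.infrastructure.application import (
    ApplicationServiceRegistry,
    SupportedProfiler,
)

# Look up the reader and standardisation service classes for Bracken.
reader = ApplicationServiceRegistry.profile_reader(SupportedProfiler.bracken)
service = ApplicationServiceRegistry.profile_standardisation_service(
    SupportedProfiler.bracken
)

# Parse the raw report, then transform it into the standard profile format.
profile = reader.read("bracken.tsv")  # hypothetical input path
standard = service.transform(profile)

Because every branch imports its implementation lazily, only the modules for the requested profiler are actually loaded.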
Functions
profile_reader(profiler: SupportedProfiler) -> Type[ProfileReader] classmethod

Return a profile reader of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def profile_reader(cls, profiler: SupportedProfiler) -> Type[ProfileReader]:
    """Return a profile reader of the correct type."""
    if profiler is SupportedProfiler.bracken:
        from .bracken import BrackenProfileReader

        return BrackenProfileReader
    elif profiler is SupportedProfiler.centrifuge:
        from .centrifuge import CentrifugeProfileReader

        return CentrifugeProfileReader
    elif profiler is SupportedProfiler.diamond:
        from .diamond import DiamondProfileReader

        return DiamondProfileReader
    elif profiler is SupportedProfiler.ganon:
        from .ganon import GanonProfileReader

        return GanonProfileReader
    elif profiler is SupportedProfiler.kaiju:
        from .kaiju import KaijuProfileReader

        return KaijuProfileReader
    elif profiler is SupportedProfiler.kmcp:
        from .kmcp import KMCPProfileReader

        return KMCPProfileReader
    elif profiler is SupportedProfiler.kraken2:
        from .kraken2 import Kraken2ProfileReader

        return Kraken2ProfileReader
    elif profiler is SupportedProfiler.krakenuniq:
        from .krakenuniq import KrakenUniqProfileReader

        return KrakenUniqProfileReader
    elif profiler is SupportedProfiler.megan6:
        from .megan6 import Megan6ProfileReader

        return Megan6ProfileReader
    elif profiler is SupportedProfiler.metaphlan:
        from .metaphlan import MetaphlanProfileReader

        return MetaphlanProfileReader
    elif profiler is SupportedProfiler.motus:
        from .motus import MotusProfileReader

        return MotusProfileReader
    else:
        raise ValueError(f"Unexpected profiler: {profiler}.")
profile_standardisation_service(profiler: SupportedProfiler) -> Type[ProfileStandardisationService] classmethod

Return a profile standardisation service of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def profile_standardisation_service(
    cls, profiler: SupportedProfiler
) -> Type[ProfileStandardisationService]:
    """Return a profile standardisation service of the correct type."""
    if profiler is SupportedProfiler.bracken:
        from .bracken import BrackenProfileStandardisationService

        return BrackenProfileStandardisationService
    elif profiler is SupportedProfiler.centrifuge:
        from .centrifuge import CentrifugeProfileStandardisationService

        return CentrifugeProfileStandardisationService
    elif profiler is SupportedProfiler.diamond:
        from .diamond import DiamondProfileStandardisationService

        return DiamondProfileStandardisationService
    elif profiler is SupportedProfiler.kaiju:
        from .kaiju import KaijuProfileStandardisationService

        return KaijuProfileStandardisationService
    elif profiler is SupportedProfiler.kraken2:
        from .kraken2 import Kraken2ProfileStandardisationService

        return Kraken2ProfileStandardisationService
    elif profiler is SupportedProfiler.krakenuniq:
        from .krakenuniq import KrakenUniqProfileStandardisationService

        return KrakenUniqProfileStandardisationService
    elif profiler is SupportedProfiler.megan6:
        from .megan6 import Megan6ProfileStandardisationService

        return Megan6ProfileStandardisationService
    elif profiler is SupportedProfiler.motus:
        from .motus import MotusProfileStandardisationService

        return MotusProfileStandardisationService
    elif profiler is SupportedProfiler.metaphlan:
        from .metaphlan import MetaphlanProfileStandardisationService

        return MetaphlanProfileStandardisationService
    elif profiler is SupportedProfiler.ganon:
        from .ganon import GanonProfileStandardisationService

        return GanonProfileStandardisationService
    elif profiler is SupportedProfiler.kmcp:
        from .kmcp import KMCPProfileStandardisationService

        return KMCPProfileStandardisationService
    else:
        raise ValueError(f"Unexpected profiler: {profiler}.")
standard_profile_writer(file_format: StandardProfileFileFormat) -> Type[StandardProfileWriter] classmethod

Return a standard profile writer of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def standard_profile_writer(
    cls, file_format: StandardProfileFileFormat
) -> Type[StandardProfileWriter]:
    """Return a standard profile writer of the correct type."""
    if file_format is StandardProfileFileFormat.TSV:
        from .standard_profile_writer.tsv_standard_profile_writer import (
            TSVStandardProfileWriter,
        )

        return TSVStandardProfileWriter
    elif file_format is StandardProfileFileFormat.CSV:
        from .standard_profile_writer.csv_standard_profile_writer import (
            CSVStandardProfileWriter,
        )

        return CSVStandardProfileWriter
    elif file_format is StandardProfileFileFormat.XLSX:
        from .standard_profile_writer.xlsx_standard_profile_writer import (
            XLSXStandardProfileWriter,
        )

        return XLSXStandardProfileWriter
    elif file_format is StandardProfileFileFormat.ODS:
        from .standard_profile_writer.ods_standard_profile_writer import (
            ODSStandardProfileWriter,
        )

        return ODSStandardProfileWriter
    elif file_format is StandardProfileFileFormat.arrow:
        from .standard_profile_writer.arrow_standard_profile_writer import (
            ArrowStandardProfileWriter,
        )

        return ArrowStandardProfileWriter
    elif file_format is StandardProfileFileFormat.parquet:
        from .standard_profile_writer.parquet_standard_profile_writer import (
            ParquetStandardProfileWriter,
        )

        return ParquetStandardProfileWriter
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported "
            f"standard profile writer format."
        )
table_reader(file_format: TableReaderFileFormat) -> Type[TableReader] classmethod

Return a table reader of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def table_reader(cls, file_format: TableReaderFileFormat) -> Type[TableReader]:
    """Return a table reader of the correct type."""
    if file_format is TableReaderFileFormat.TSV:
        from .table_reader.tsv_table_reader import TSVTableReader

        return TSVTableReader
    elif file_format is TableReaderFileFormat.CSV:
        from .table_reader.csv_table_reader import CSVTableReader

        return CSVTableReader
    elif file_format is TableReaderFileFormat.XLSX:
        from .table_reader.xlsx_table_reader import XLSXTableReader

        return XLSXTableReader
    elif file_format is TableReaderFileFormat.ODS:
        from .table_reader.ods_table_reader import ODSTableReader

        return ODSTableReader
    elif file_format is TableReaderFileFormat.arrow:
        from .table_reader.arrow_table_reader import ArrowTableReader

        return ArrowTableReader
    elif file_format is TableReaderFileFormat.parquet:
        from .table_reader.parquet_table_reader import ParquetTableReader

        return ParquetTableReader
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported table "
            f"reader format."
        )
tidy_observation_table_writer(file_format: TidyObservationTableFileFormat) -> Type[TidyObservationTableWriter] classmethod

Return a tidy table writer of the correct type.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def tidy_observation_table_writer(
    cls, file_format: TidyObservationTableFileFormat
) -> Type[TidyObservationTableWriter]:
    """Return a tidy table writer of the correct type."""
    if file_format is TidyObservationTableFileFormat.TSV:
        from .tidy_observation_table_writer.tsv_table_writer import (
            TSVTidyObservationTableWriter,
        )

        return TSVTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.CSV:
        from .tidy_observation_table_writer.csv_table_writer import (
            CSVTidyObservationTableWriter,
        )

        return CSVTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.XLSX:
        from .tidy_observation_table_writer.xlsx_table_writer import (
            XLSXTidyObservationTableWriter,
        )

        return XLSXTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.ODS:
        from .tidy_observation_table_writer.ods_table_writer import (
            ODSTidyObservationTableWriter,
        )

        return ODSTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.arrow:
        from .tidy_observation_table_writer.arrow_table_writer import (
            ArrowTidyObservationTableWriter,
        )

        return ArrowTidyObservationTableWriter
    elif file_format is TidyObservationTableFileFormat.parquet:
        from .tidy_observation_table_writer.parquet_table_writer import (
            ParquetTidyObservationTableWriter,
        )

        return ParquetTidyObservationTableWriter
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported tidy "
            f"observation table writer format."
        )
wide_observation_table_writer(file_format: WideObservationTableFileFormat) -> Type[WideObservationTableWriter] classmethod

Return a writer for wide observation tables in the specified format.

Source code in src/taxpasta/infrastructure/application/application_service_registry.py
@classmethod
def wide_observation_table_writer(
    cls, file_format: WideObservationTableFileFormat
) -> Type[WideObservationTableWriter]:
    """Return a writer for wide observation tables in the specified format."""
    if file_format is WideObservationTableFileFormat.TSV:
        from .wide_observation_table_writer.tsv_wide_observation_table_writer import (
            TSVWideObservationTableWriter,
        )

        return TSVWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.CSV:
        from .wide_observation_table_writer.csv_wide_observation_table_writer import (
            CSVWideObservationTableWriter,
        )

        return CSVWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.XLSX:
        from .wide_observation_table_writer.xlsx_wide_observation_table_writer import (
            XLSXWideObservationTableWriter,
        )

        return XLSXWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.ODS:
        from .wide_observation_table_writer.ods_wide_observation_table_writer import (
            ODSWideObservationTableWriter,
        )

        return ODSWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.arrow:
        from .wide_observation_table_writer.arrow_wide_observation_table_writer import (
            ArrowWideObservationTableWriter,
        )

        return ArrowWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.parquet:
        from .wide_observation_table_writer.parquet_wide_observation_table_writer import (
            ParquetWideObservationTableWriter,
        )

        return ParquetWideObservationTableWriter
    elif file_format is WideObservationTableFileFormat.BIOM:
        from .wide_observation_table_writer.biom_wide_observation_table_writer import (
            BIOMWideObservationTableWriter,
        )

        return BIOMWideObservationTableWriter
    else:
        raise ValueError(
            f"The given file format {file_format.name} is not a supported "
            f"observation matrix writer format."
        )
bracken
Classes
Modules
bracken_profile

Provide a description of the Bracken profile format.

Attributes
BRACKEN_FRACTION_TOLERANCE = 0.01 module-attribute
BRACKEN_FRACTION_TOTAL = 1.0 module-attribute
Classes
BrackenProfile

Bases: BaseDataFrameModel

Define the expected Bracken profile format.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
class BrackenProfile(BaseDataFrameModel):
    """Define the expected Bracken profile format."""

    name: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    taxonomy_lvl: Series[str] = pa.Field()
    kraken_assigned_reads: Series[int] = pa.Field(ge=0)
    added_reads: Series[int] = pa.Field(ge=0)
    new_est_reads: Series[int] = pa.Field(ge=0)
    fraction_total_reads: Series[float] = pa.Field(ge=0.0, le=1.0)

    @pa.check("fraction_total_reads", name="compositionality", raise_warning=True)
    def check_compositionality(cls, fraction_total_reads: Series[float]) -> bool:
        """Check that the fractions of reads add up to one."""
        # Bracken reports fractions with five decimals but rounding errors accumulate.
        return fraction_total_reads.empty or bool(
            np.isclose(
                fraction_total_reads.sum(),
                BRACKEN_FRACTION_TOTAL,
                atol=BRACKEN_FRACTION_TOLERANCE,
            )
        )

    @pa.dataframe_check
    def check_added_reads_consistency(cls, profile: DataFrame) -> Series[bool]:
        """Check that Bracken added reads are consistent."""
        return (
            profile[cls.kraken_assigned_reads] + profile[cls.added_reads]
            == profile[cls.new_est_reads]
        )
Attributes
added_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
fraction_total_reads: Series[float] = pa.Field(ge=0.0, le=1.0) class-attribute instance-attribute
kraken_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
new_est_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_lvl: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_added_reads_consistency(profile: DataFrame) -> Series[bool]

Check that Bracken added reads are consistent.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
@pa.dataframe_check
def check_added_reads_consistency(cls, profile: DataFrame) -> Series[bool]:
    """Check that Bracken added reads are consistent."""
    return (
        profile[cls.kraken_assigned_reads] + profile[cls.added_reads]
        == profile[cls.new_est_reads]
    )
check_compositionality(fraction_total_reads: Series[float]) -> bool

Check that the fractions of reads add up to one.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile.py
@pa.check("fraction_total_reads", name="compositionality", raise_warning=True)
def check_compositionality(cls, fraction_total_reads: Series[float]) -> bool:
    """Check that the fractions of reads add up to one."""
    # Bracken reports fractions with five decimals but rounding errors accumulate.
    return fraction_total_reads.empty or bool(
        np.isclose(
            fraction_total_reads.sum(),
            BRACKEN_FRACTION_TOTAL,
            atol=BRACKEN_FRACTION_TOLERANCE,
        )
    )
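In essence, the compositionality check is a single np.isclose call with an absolute tolerance. A minimal, self-contained sketch with toy numbers (not real Bracken output):

import numpy as np
import pandas as pd

BRACKEN_FRACTION_TOTAL = 1.0
BRACKEN_FRACTION_TOLERANCE = 0.01

# Accumulated rounding leaves the sum slightly short of 1.0.
fractions = pd.Series([0.49999, 0.30001, 0.19998])

# The sum of 0.99998 lies within the 0.01 absolute tolerance, so this passes.
assert np.isclose(
    fractions.sum(), BRACKEN_FRACTION_TOTAL, atol=BRACKEN_FRACTION_TOLERANCE
)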
bracken_profile_reader

Provide a reader for Bracken profiles.

Attributes

Classes
BrackenProfileReader

Bases: ProfileReader

Define a reader for Bracken profiles.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_reader.py
class BrackenProfileReader(ProfileReader):
    """Define a reader for Bracken profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[BrackenProfile]:
        """
        Read a Bracken taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by Bracken.

        Returns:
            A data frame representation of the Bracken profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, BrackenProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[BrackenProfile] classmethod

Read a Bracken taxonomic profile from the given source.

Parameters:

profile (BufferOrFilepath, required): A source that contains a tab-separated taxonomic profile generated by Bracken.

Returns:

DataFrame[BrackenProfile]: A data frame representation of the Bracken profile.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[BrackenProfile]:
    """
    Read a Bracken taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by Bracken.

    Returns:
        A data frame representation of the Bracken profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, BrackenProfile)
    return result
Functions
bracken_profile_standardisation_service

Provide a standardisation service for Bracken profiles.

Classes
BrackenProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for Bracken profiles.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_standardisation_service.py
class BrackenProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for Bracken profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[BrackenProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given Bracken profile.

        Args:
            profile: A taxonomic profile generated by Bracken.

        Returns:
            A standardized profile.

        Raises:
            pandera.errors.SchemaErrors: If the given profile does not conform with the
                `BrackenProfile` or the transformed output does not conform with the
                `StandardProfile`.  # noqa: DAR402

        """
        return (
            profile[[BrackenProfile.taxonomy_id, BrackenProfile.new_est_reads]]
            .copy()
            .rename(
                columns={
                    BrackenProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    BrackenProfile.new_est_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[BrackenProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given Bracken profile.

Parameters:

profile (DataFrame[BrackenProfile], required): A taxonomic profile generated by Bracken.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Raises:

pandera.errors.SchemaErrors: If the given profile does not conform with the BrackenProfile or the transformed output does not conform with the StandardProfile.

Source code in src/taxpasta/infrastructure/application/bracken/bracken_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[BrackenProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given Bracken profile.

    Args:
        profile: A taxonomic profile generated by Bracken.

    Returns:
        A standardized profile.

    Raises:
        pandera.errors.SchemaErrors: If the given profile does not conform with the
            `BrackenProfile` or the transformed output does not conform with the
            `StandardProfile`.  # noqa: DAR402

    """
    return (
        profile[[BrackenProfile.taxonomy_id, BrackenProfile.new_est_reads]]
        .copy()
        .rename(
            columns={
                BrackenProfile.taxonomy_id: StandardProfile.taxonomy_id,
                BrackenProfile.new_est_reads: StandardProfile.count,
            }
        )
    )
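On a toy frame, the transformation amounts to selecting the two relevant columns and renaming them to the StandardProfile schema; the schema classes above supply the column names as string constants. The example below uses made-up numbers:

import pandas as pd

# Only the two columns the transformation touches; a real Bracken profile
# carries all seven BrackenProfile columns.
profile = pd.DataFrame(
    {
        "taxonomy_id": [9606, 562],
        "new_est_reads": [1500, 300],
    }
)

standard = profile[["taxonomy_id", "new_est_reads"]].rename(
    columns={"new_est_reads": "count"}
)
# standard now has the columns taxonomy_id and count.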
centrifuge
Classes
Modules
centrifuge_profile

Provide a description of the centrifuge profile format.

Attributes
CENTRIFUGE_PERCENT_TOLERANCE = 1.0 module-attribute
CENTRIFUGE_PERCENT_TOTAL = 100.0 module-attribute
Classes
CentrifugeProfile

Bases: BaseDataFrameModel

Define the expected centrifuge profile format.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile.py
class CentrifugeProfile(BaseDataFrameModel):
    """Define the expected centrifuge profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    clade_assigned_reads: Series[int] = pa.Field(ge=0)
    direct_assigned_reads: Series[int] = pa.Field(ge=0)
    taxonomy_level: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    name: Series[str] = pa.Field()

    @pa.check("percent", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percent: Series[float]) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        return percent.empty or bool(
            np.isclose(
                percent[:2].sum(),
                CENTRIFUGE_PERCENT_TOTAL,
                atol=CENTRIFUGE_PERCENT_TOLERANCE,
            )
        )
Attributes
clade_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
direct_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_level: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(percent: Series[float]) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile.py
@pa.check("percent", name="compositionality", raise_warning=True)
def check_compositionality(cls, percent: Series[float]) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    return percent.empty or bool(
        np.isclose(
            percent[:2].sum(),
            CENTRIFUGE_PERCENT_TOTAL,
            atol=CENTRIFUGE_PERCENT_TOLERANCE,
        )
    )
centrifuge_profile_reader

Provide a reader for Centrifuge profiles.

Attributes

Classes
CentrifugeProfileReader

Bases: ProfileReader

Define a reader for centrifuge profiles.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_reader.py
class CentrifugeProfileReader(ProfileReader):
    """Define a reader for centrifuge profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile]:
        """
        Read a centrifuge taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by centrifuge.

        Returns:
            A data frame representation of the centrifuge profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, CentrifugeProfile)
        result.columns = [
            CentrifugeProfile.percent,
            CentrifugeProfile.clade_assigned_reads,
            CentrifugeProfile.direct_assigned_reads,
            CentrifugeProfile.taxonomy_level,
            CentrifugeProfile.taxonomy_id,
            CentrifugeProfile.name,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile] classmethod

Read a centrifuge taxonomic profile from the given source.

Parameters:

profile (BufferOrFilepath, required): A source that contains a tab-separated taxonomic profile generated by centrifuge.

Returns:

DataFrame[CentrifugeProfile]: A data frame representation of the centrifuge profile.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[CentrifugeProfile]:
    """
    Read a centrifuge taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by centrifuge.

    Returns:
        A data frame representation of the centrifuge profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, CentrifugeProfile)
    result.columns = [
        CentrifugeProfile.percent,
        CentrifugeProfile.clade_assigned_reads,
        CentrifugeProfile.direct_assigned_reads,
        CentrifugeProfile.taxonomy_level,
        CentrifugeProfile.taxonomy_id,
        CentrifugeProfile.name,
    ]
    return result
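Because centrifuge reports carry no header row, the reader parses with header=None and assigns the schema's column names afterwards. A minimal sketch of the same pattern with a single toy row (the reader itself additionally validates the column count):

import io

import pandas as pd

# One toy row in the headerless, tab-separated centrifuge report layout:
# percent, clade reads, direct reads, taxonomy level, taxonomy ID, name.
raw = io.StringIO("100.0\t42\t42\tU\t0\tunclassified\n")

result = pd.read_table(raw, sep="\t", header=None, index_col=False)
result.columns = [
    "percent",
    "clade_assigned_reads",
    "direct_assigned_reads",
    "taxonomy_level",
    "taxonomy_id",
    "name",
]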
Functions
centrifuge_profile_standardisation_service

Provide a standardisation service for centrifuge profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
CentrifugeProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for centrifuge profiles.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_standardisation_service.py
class CentrifugeProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for centrifuge profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[CentrifugeProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given centrifuge profile.

        Args:
            profile: A taxonomic profile generated by centrifuge.

        Returns:
            A standardized profile.

        """
        return (
            profile[
                [CentrifugeProfile.taxonomy_id, CentrifugeProfile.direct_assigned_reads]
            ]
            .copy()
            .rename(
                columns={
                    CentrifugeProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    CentrifugeProfile.direct_assigned_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[CentrifugeProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given centrifuge profile.

Parameters:

profile (DataFrame[CentrifugeProfile], required): A taxonomic profile generated by centrifuge.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/centrifuge/centrifuge_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[CentrifugeProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given centrifuge profile.

    Args:
        profile: A taxonomic profile generated by centrifuge.

    Returns:
        A standardized profile.

    """
    return (
        profile[
            [CentrifugeProfile.taxonomy_id, CentrifugeProfile.direct_assigned_reads]
        ]
        .copy()
        .rename(
            columns={
                CentrifugeProfile.taxonomy_id: StandardProfile.taxonomy_id,
                CentrifugeProfile.direct_assigned_reads: StandardProfile.count,
            }
        )
    )
diamond
Classes
Modules
diamond_profile

Provide a description of the diamond profile format.

Classes
DiamondProfile

Bases: BaseDataFrameModel

Define the expected diamond profile format.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile.py
class DiamondProfile(BaseDataFrameModel):
    """Define the expected diamond profile format."""

    query_id: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    e_value: Series[float] = pa.Field(ge=0.0, le=1.0)
Attributes
e_value: Series[float] = pa.Field(ge=0.0, le=1.0) class-attribute instance-attribute
query_id: Series[str] = pa.Field() class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
diamond_profile_reader

Provide a reader for diamond profiles.

Attributes

Classes
DiamondProfileReader

Bases: ProfileReader

Define a reader for Diamond profiles.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_reader.py
class DiamondProfileReader(ProfileReader):
    """Define a reader for Diamond profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[DiamondProfile]:
        """Read a diamond taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            dtype={2: float},
        )
        cls._check_num_columns(result, DiamondProfile)
        result.columns = [
            DiamondProfile.query_id,
            DiamondProfile.taxonomy_id,
            DiamondProfile.e_value,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[DiamondProfile] classmethod

Read a diamond taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[DiamondProfile]:
    """Read a diamond taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        dtype={2: float},
    )
    cls._check_num_columns(result, DiamondProfile)
    result.columns = [
        DiamondProfile.query_id,
        DiamondProfile.taxonomy_id,
        DiamondProfile.e_value,
    ]
    return result
Functions
diamond_profile_standardisation_service

Provide a standardisation service for diamond profiles.

Classes
DiamondProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for diamond profiles.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_standardisation_service.py
class DiamondProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for diamond profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[DiamondProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given diamond profile.

        Args:
            profile: A taxonomic profile generated by diamond.

        Returns:
            A standardized profile.

        """
        # Sum up occurrences of taxonomy identifiers to yield read count.
        return (
            profile[[DiamondProfile.taxonomy_id]]
            .groupby(DiamondProfile.taxonomy_id, sort=False)
            .size()
            .reset_index()
            .rename(
                columns={
                    DiamondProfile.taxonomy_id: StandardProfile.taxonomy_id,
                    0: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[DiamondProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given diamond profile.

Parameters:

profile (DataFrame[DiamondProfile], required): A taxonomic profile generated by diamond.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/diamond/diamond_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[DiamondProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given diamond profile.

    Args:
        profile: A taxonomic profile generated by diamond.

    Returns:
        A standardized profile.

    """
    # Sum up occurrences of taxonomy identifiers to yield read count.
    return (
        profile[[DiamondProfile.taxonomy_id]]
        .groupby(DiamondProfile.taxonomy_id, sort=False)
        .size()
        .reset_index()
        .rename(
            columns={
                DiamondProfile.taxonomy_id: StandardProfile.taxonomy_id,
                0: StandardProfile.count,
            }
        )
    )
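The groupby step deserves a second look: every diamond row is a single read-to-taxon hit, so counting rows per taxonomy identifier yields the read count. A toy sketch of that aggregation:

import pandas as pd

# Each row represents one read assigned to a taxon.
hits = pd.DataFrame({"taxonomy_id": [562, 562, 9606, 562]})

counts = (
    hits.groupby("taxonomy_id", sort=False)
    .size()
    .reset_index()
    .rename(columns={0: "count"})
)
# counts: taxonomy_id 562 -> 3, taxonomy_id 9606 -> 1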
ganon
Classes
Modules
ganon_profile

Provide a description of the ganon profile format.

Attributes
GANON_PERCENT_TOLERANCE = 1.0 module-attribute
GANON_PERCENT_TOTAL = 100.0 module-attribute
Classes
GanonProfile

Bases: BaseDataFrameModel

Define the expected ganon profile format.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile.py
class GanonProfile(BaseDataFrameModel):
    """Define the expected ganon profile format."""

    rank: Series[str] = pa.Field()
    target: Series[str] = pa.Field()
    lineage: Series[str] = pa.Field()
    name: Series[str] = pa.Field()
    number_unique: Series[int] = pa.Field(ge=0)
    number_shared: Series[int] = pa.Field(ge=0)
    number_children: Series[int] = pa.Field(ge=0)
    number_cumulative: Series[int] = pa.Field(ge=0)
    percent_cumulative: Series[float] = pa.Field(ge=0.0, le=100.0)

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        # Ganon reports percentage to 5 decimal places, but rounding errors do add up.
        return profile.empty or bool(
            np.isclose(
                profile.loc[
                    profile[cls.rank].isin(["unclassified", "root"]),
                    cls.percent_cumulative,
                ].sum(),
                GANON_PERCENT_TOTAL,
                atol=GANON_PERCENT_TOLERANCE,
            )
        )
Attributes
lineage: Series[str] = pa.Field() class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
number_children: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_cumulative: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_shared: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
number_unique: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
percent_cumulative: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
rank: Series[str] = pa.Field() class-attribute instance-attribute
target: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    # Ganon reports percentage to 5 decimal places, but rounding errors do add up.
    return profile.empty or bool(
        np.isclose(
            profile.loc[
                profile[cls.rank].isin(["unclassified", "root"]),
                cls.percent_cumulative,
            ].sum(),
            GANON_PERCENT_TOTAL,
            atol=GANON_PERCENT_TOLERANCE,
        )
    )
ganon_profile_reader

Provide a reader for ganon profiles.

Attributes

Classes
GanonProfileReader

Bases: ProfileReader

Define a reader for ganon profiles.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_reader.py
class GanonProfileReader(ProfileReader):
    """Define a reader for ganon profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[GanonProfile]:
        """
        Read a ganon taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by ganon.

        Returns:
            A data frame representation of the ganon profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        cls._check_num_columns(result, GanonProfile)
        result.columns = [
            GanonProfile.rank,
            GanonProfile.target,
            GanonProfile.lineage,
            GanonProfile.name,
            GanonProfile.number_unique,
            GanonProfile.number_shared,
            GanonProfile.number_children,
            GanonProfile.number_cumulative,
            GanonProfile.percent_cumulative,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[GanonProfile] classmethod

Read a ganon taxonomic profile from the given source.

Parameters:

profile (BufferOrFilepath, required): A source that contains a tab-separated taxonomic profile generated by ganon.

Returns:

DataFrame[GanonProfile]: A data frame representation of the ganon profile.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[GanonProfile]:
    """
    Read a ganon taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by ganon.

    Returns:
        A data frame representation of the ganon profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    cls._check_num_columns(result, GanonProfile)
    result.columns = [
        GanonProfile.rank,
        GanonProfile.target,
        GanonProfile.lineage,
        GanonProfile.name,
        GanonProfile.number_unique,
        GanonProfile.number_shared,
        GanonProfile.number_children,
        GanonProfile.number_cumulative,
        GanonProfile.percent_cumulative,
    ]
    return result
Functions
ganon_profile_standardisation_service

Provide a standardisation service for ganon profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
GanonProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for ganon profiles.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_standardisation_service.py
class GanonProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for ganon profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given ganon profile.

        Args:
            profile: A taxonomic profile generated by ganon.

        Returns:
            A standardized profile.

        """
        # Select unclassified entries, rename columns, assign taxonomy ID zero, and
        #  sum up counts.
        unclassified = (
            profile.loc[
                profile[GanonProfile.target] == "-",
                [GanonProfile.target, GanonProfile.number_unique],
            ]
            .copy()
            .rename(
                columns={
                    GanonProfile.target: StandardProfile.taxonomy_id,
                    GanonProfile.number_unique: StandardProfile.count,
                }
            )
            .assign(**{StandardProfile.taxonomy_id: 0})
            .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
            .sum()
        )
        # Select classified entries, rename columns, and convert taxonomy ID to integer.
        classified = (
            profile.loc[
                profile[GanonProfile.target] != "-",
                [GanonProfile.target, GanonProfile.number_unique],
            ]
            .copy()
            .rename(
                columns={
                    GanonProfile.target: StandardProfile.taxonomy_id,
                    GanonProfile.number_unique: StandardProfile.count,
                }
            )
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ].astype(int)
                }
            )
        )
        return pd.concat([unclassified, classified], ignore_index=True)
Functions
transform(profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given ganon profile.

Parameters:

profile (DataFrame[GanonProfile], required): A taxonomic profile generated by ganon.

Returns:

DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/ganon/ganon_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[GanonProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given ganon profile.

    Args:
        profile: A taxonomic profile generated by ganon.

    Returns:
        A standardized profile.

    """
    # Select unclassified entries, rename columns, assign taxonomy ID zero, and
    #  sum up counts.
    unclassified = (
        profile.loc[
            profile[GanonProfile.target] == "-",
            [GanonProfile.target, GanonProfile.number_unique],
        ]
        .copy()
        .rename(
            columns={
                GanonProfile.target: StandardProfile.taxonomy_id,
                GanonProfile.number_unique: StandardProfile.count,
            }
        )
        .assign(**{StandardProfile.taxonomy_id: 0})
        .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
        .sum()
    )
    # Select classified entries, rename columns, and convert taxonomy ID to integer.
    classified = (
        profile.loc[
            profile[GanonProfile.target] != "-",
            [GanonProfile.target, GanonProfile.number_unique],
        ]
        .copy()
        .rename(
            columns={
                GanonProfile.target: StandardProfile.taxonomy_id,
                GanonProfile.number_unique: StandardProfile.count,
            }
        )
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ].astype(int)
            }
        )
    )
    return pd.concat([unclassified, classified], ignore_index=True)
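The two-way split is the crux: ganon marks unclassified reads with a "-" target, which cannot be cast to an integer taxonomy identifier, so those rows are collapsed onto taxonomy ID zero before being recombined with the classified rows. A toy sketch of the same pattern:

import pandas as pd

# "-" marks unclassified reads; other targets are taxonomy identifiers.
profile = pd.DataFrame(
    {
        "target": ["-", "562", "9606"],
        "number_unique": [10, 3, 7],
    }
)

mask = profile["target"] == "-"
unclassified = (
    profile.loc[mask, ["target", "number_unique"]]
    .rename(columns={"target": "taxonomy_id", "number_unique": "count"})
    .assign(taxonomy_id=0)
    .groupby("taxonomy_id", as_index=False, sort=False)
    .sum()
)
classified = (
    profile.loc[~mask, ["target", "number_unique"]]
    .rename(columns={"target": "taxonomy_id", "number_unique": "count"})
    .assign(taxonomy_id=lambda df: df["taxonomy_id"].astype(int))
)
result = pd.concat([unclassified, classified], ignore_index=True)
# result rows: (0, 10), (562, 3), (9606, 7)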
kaiju
Classes
Modules
kaiju_profile

Provide a description of the kaiju profile format.

Attributes
KAIJU_PERCENT_TOLERANCE = 1.0 module-attribute
KAIJU_PERCENT_TOTAL = 100.0 module-attribute
Classes
KaijuProfile

Bases: BaseDataFrameModel

Define the expected kaiju profile format.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
class KaijuProfile(BaseDataFrameModel):
    """Define the expected kaiju profile format."""

    file: Series[str] = pa.Field()
    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    reads: Series[int] = pa.Field(ge=0)
    taxon_id: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    taxon_name: Series[str] = pa.Field()

    @pa.check("percent", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percent: Series[float]) -> bool:
        """Check that the percentages add up to a hundred."""
        # Kaiju reports percentages to six decimal places.
        return percent.empty or bool(
            np.isclose(percent.sum(), KAIJU_PERCENT_TOTAL, atol=KAIJU_PERCENT_TOLERANCE)
        )

    @pa.check("file", name="unique_filename")
    def check_unique_filename(cls, file_col: Series[str]) -> bool:
        """Check that Kaiju filename is unique."""
        return file_col.empty or file_col.nunique() == 1
Attributes
file: Series[str] = pa.Field() class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxon_id: Series[pd.Int64Dtype] = pa.Field(nullable=True) class-attribute instance-attribute
taxon_name: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(percent: Series[float]) -> bool

Check that the percentages add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
@pa.check("percent", name="compositionality", raise_warning=True)
def check_compositionality(cls, percent: Series[float]) -> bool:
    """Check that the percentages add up to a hundred."""
    # Kaiju reports percentages to six decimal places.
    return percent.empty or bool(
        np.isclose(percent.sum(), KAIJU_PERCENT_TOTAL, atol=KAIJU_PERCENT_TOLERANCE)
    )
check_unique_filename(file_col: Series[str]) -> bool

Check that Kaiju filename is unique.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile.py
@pa.check("file", name="unique_filename")
def check_unique_filename(cls, file_col: Series[str]) -> bool:
    """Check that Kaiju filename is unique."""
    return file_col.empty or file_col.nunique() == 1
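The tolerance of 1.0 makes the compositionality check deliberately lenient: cumulative rounding drift of up to roughly one percentage point passes, while larger deviations fail the check (and, because of raise_warning=True, pandera emits a warning instead of rejecting the profile). A quick numpy sketch with invented sums:

import numpy as np

print(np.isclose(99.2, 100.0, atol=1.0))  # True: within one point of 100
print(np.isclose(97.5, 100.0, atol=1.0))  # False: drift too large, warning fires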
kaiju_profile_reader

Provide a reader for kaiju profiles.

Classes
KaijuProfileReader

Bases: ProfileReader

Define a reader for kaiju profiles.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_reader.py
class KaijuProfileReader(ProfileReader):
    """Define a reader for kaiju profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KaijuProfile]:
        """
        Read a kaiju taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by kaiju.

        Returns:
            A data frame representation of the kaiju profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=0,
            index_col=False,
            dtype={KaijuProfile.taxon_id: "Int64"},
        )
        cls._check_num_columns(result, KaijuProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KaijuProfile] classmethod

Read a kaiju taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by kaiju. Required.

Returns:

    DataFrame[KaijuProfile]: A data frame representation of the kaiju profile.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KaijuProfile]:
    """
    Read a kaiju taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by kaiju.

    Returns:
        A data frame representation of the kaiju profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=0,
        index_col=False,
        dtype={KaijuProfile.taxon_id: "Int64"},
    )
    cls._check_num_columns(result, KaijuProfile)
    return result
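A usage sketch for the reader, assuming taxpasta is installed and that the import path matches the source file shown above; the two report rows are invented values:

import io

from taxpasta.infrastructure.application.kaiju.kaiju_profile_reader import (
    KaijuProfileReader,
)

report = io.StringIO(
    "file\tpercent\treads\ttaxon_id\ttaxon_name\n"
    "sample.out\t90.0\t900\t562\tEscherichia coli\n"
    "sample.out\t10.0\t100\tNA\tunclassified\n"
)
frame = KaijuProfileReader.read(report)
print(frame.dtypes["taxon_id"])  # Int64: missing IDs remain nullable integers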
kaiju_profile_standardisation_service

Provide a standardisation service for kaiju profiles.

Classes
KaijuProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for kaiju profiles.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_standardisation_service.py
class KaijuProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for kaiju profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given kaiju profile.

        Args:
            profile: A taxonomic profile generated by kaiju.

        Returns:
            A standardized profile.

        """
        temp = (
            profile[[KaijuProfile.taxon_id, KaijuProfile.reads]]
            .copy()
            .rename(
                columns={
                    KaijuProfile.taxon_id: StandardProfile.taxonomy_id,
                    KaijuProfile.reads: StandardProfile.count,
                }
            )
        )
        result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
        result[StandardProfile.taxonomy_id] = result[
            StandardProfile.taxonomy_id
        ].astype(int)
        # Replace missing values (unclassified reads) with ID zero and sum reads.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given kaiju profile.

Parameters:

    profile (DataFrame[KaijuProfile]): A taxonomic profile generated by kaiju. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/kaiju/kaiju_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[KaijuProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given kaiju profile.

    Args:
        profile: A taxonomic profile generated by kaiju.

    Returns:
        A standardized profile.

    """
    temp = (
        profile[[KaijuProfile.taxon_id, KaijuProfile.reads]]
        .copy()
        .rename(
            columns={
                KaijuProfile.taxon_id: StandardProfile.taxonomy_id,
                KaijuProfile.reads: StandardProfile.count,
            }
        )
    )
    result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
    result[StandardProfile.taxonomy_id] = result[
        StandardProfile.taxonomy_id
    ].astype(int)
    # Replace missing values (unclassified reads) with ID zero and sum reads.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
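The aggregation pattern above recurs nearly verbatim in the KMCP and mOTUs services further down this page: rows whose taxonomy ID is missing are collapsed into a single entry with ID zero. A minimal, self-contained pandas sketch on invented data:

import pandas as pd

temp = pd.DataFrame(
    {
        "taxonomy_id": pd.array([562, 1280, pd.NA], dtype="Int64"),
        "count": [900, 80, 20],
    }
)

# Keep rows with a known taxonomy ID and cast them to plain integers.
classified = temp.loc[temp["taxonomy_id"].notna()].copy()
classified["taxonomy_id"] = classified["taxonomy_id"].astype(int)

# Sum the counts of all rows without an ID into the pseudo-taxon zero.
unclassified_total = temp.loc[temp["taxonomy_id"].isna(), "count"].sum()
result = pd.concat(
    [classified, pd.DataFrame({"taxonomy_id": [0], "count": [unclassified_total]})],
    ignore_index=True,
)
print(result)
#    taxonomy_id  count
# 0          562    900
# 1         1280     80
# 2            0     20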
kmcp
Classes
Modules
kmcp_profile

Provide a description of the KMCP profile format.

Attributes
KMCP_PERCENT_TOLERANCE = 1.0 module-attribute
KMCP_PERCENT_TOTAL = 100.0 module-attribute
Classes
KMCPProfile

Bases: BaseDataFrameModel

Define the expected KMCP profile format.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
class KMCPProfile(BaseDataFrameModel):
    """Define the expected KMCP profile format."""

    reference: Series[str] = pa.Field(alias="ref")
    percentage: Series[float] = pa.Field(ge=0.0, le=100.0)
    coverage: Series[float] = pa.Field(ge=0.0, nullable=True)
    score: Series[float] = pa.Field(ge=0.0, le=100.0)
    chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias="chunksFrac")
    chunks_relative_depth: Series[str] = pa.Field(alias="chunksRelDepth")
    chunks_relative_depth_std: Series[float] = pa.Field(
        ge=0.0, nullable=True, alias="chunksRelDepthStd"
    )
    reads: Series[int] = pa.Field(ge=0)
    unique_reads: Series[int] = pa.Field(ge=0, alias="ureads")
    high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias="hicureads")
    reference_size: Series[int] = pa.Field(ge=0, alias="refsize")
    reference_name: Series[str] = pa.Field(nullable=True, alias="refname")
    taxid: Series[int] = pa.Field(ge=0)
    rank: Series[str] = pa.Field(nullable=True)
    taxonomic_name: Series[str] = pa.Field(nullable=True, alias="taxname")
    taxonomic_path: Series[str] = pa.Field(nullable=True, alias="taxpath")
    taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias="taxpathsn")

    @pa.check("percentage", name="compositionality", raise_warning=True)
    def check_compositionality(cls, percentage: Series[float]) -> bool:
        """Check that the percentages add up to a hundred."""
        # KMCP reports percentages to six decimal places.
        return percentage.empty or bool(
            np.isclose(
                percentage.sum(), KMCP_PERCENT_TOTAL, atol=KMCP_PERCENT_TOLERANCE
            )
        )
Attributes
chunks_fraction: Series[float] = pa.Field(ge=0.0, le=1.0, alias='chunksFrac') class-attribute instance-attribute
chunks_relative_depth: Series[str] = pa.Field(alias='chunksRelDepth') class-attribute instance-attribute
chunks_relative_depth_std: Series[float] = pa.Field(ge=0.0, nullable=True, alias='chunksRelDepthStd') class-attribute instance-attribute
coverage: Series[float] = pa.Field(ge=0.0, nullable=True) class-attribute instance-attribute
high_confidence_unique_reads: Series[int] = pa.Field(ge=0, alias='hicureads') class-attribute instance-attribute
percentage: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
rank: Series[str] = pa.Field(nullable=True) class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
reference: Series[str] = pa.Field(alias='ref') class-attribute instance-attribute
reference_name: Series[str] = pa.Field(nullable=True, alias='refname') class-attribute instance-attribute
reference_size: Series[int] = pa.Field(ge=0, alias='refsize') class-attribute instance-attribute
score: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxid: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomic_name: Series[str] = pa.Field(nullable=True, alias='taxname') class-attribute instance-attribute
taxonomic_path: Series[str] = pa.Field(nullable=True, alias='taxpath') class-attribute instance-attribute
taxonomic_path_lineage: Series[str] = pa.Field(nullable=True, alias='taxpathsn') class-attribute instance-attribute
unique_reads: Series[int] = pa.Field(ge=0, alias='ureads') class-attribute instance-attribute
Functions
check_compositionality(percentage: Series[float]) -> bool

Check that the percentages add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile.py
@pa.check("percentage", name="compositionality", raise_warning=True)
def check_compositionality(cls, percentage: Series[float]) -> bool:
    """Check that the percentages add up to a hundred."""
    # KMCP reports percentages to six decimal places.
    return percentage.empty or bool(
        np.isclose(
            percentage.sum(), KMCP_PERCENT_TOTAL, atol=KMCP_PERCENT_TOLERANCE
        )
    )
kmcp_profile_reader

Provide a reader for KMCP profiles.

Classes
KMCPProfileReader

Bases: ProfileReader

Define a reader for KMCP profiles.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
class KMCPProfileReader(ProfileReader):
    """Define a reader for KMCP profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
        """
        Read a KMCP taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by KMCP.

        Returns:
            A data frame representation of the KMCP profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=0,
            index_col=False,
            dtype={
                KMCPProfile.chunks_relative_depth: str,
            },
        )
        cls._check_num_columns(result, KMCPProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KMCPProfile] classmethod

Read a KMCP taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by KMCP. Required.

Returns:

    DataFrame[KMCPProfile]: A data frame representation of the KMCP profile.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KMCPProfile]:
    """
    Read a KMCP taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by KMCP.

    Returns:
        A data frame representation of the KMCP profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=0,
        index_col=False,
        dtype={
            KMCPProfile.chunks_relative_depth: str,
        },
    )
    cls._check_num_columns(result, KMCPProfile)
    return result
kmcp_profile_standardisation_service

Provide a standardisation service for KMCP profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
KMCPProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for KMCP profiles.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
class KMCPProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for KMCP profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given KMCP profile.

        Args:
            profile: A taxonomic profile generated by KMCP.

        Returns:
            A standardized profile.

        """
        temp = (
            profile[[KMCPProfile.taxid, KMCPProfile.reads]]
            .copy()
            .rename(
                columns={
                    KMCPProfile.taxid: StandardProfile.taxonomy_id,
                    KMCPProfile.reads: StandardProfile.count,
                }
            )
        )
        result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
        result[StandardProfile.taxonomy_id] = result[
            StandardProfile.taxonomy_id
        ].astype(int)
        # Replace missing values (unclassified reads) with ID zero and sum reads.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given KMCP profile.

Parameters:

    profile (DataFrame[KMCPProfile]): A taxonomic profile generated by KMCP. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/kmcp/kmcp_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[KMCPProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given KMCP profile.

    Args:
        profile: A taxonomic profile generated by KMCP.

    Returns:
        A standardized profile.

    """
    temp = (
        profile[[KMCPProfile.taxid, KMCPProfile.reads]]
        .copy()
        .rename(
            columns={
                KMCPProfile.taxid: StandardProfile.taxonomy_id,
                KMCPProfile.reads: StandardProfile.count,
            }
        )
    )
    result = temp.loc[temp[StandardProfile.taxonomy_id].notna(), :].copy()
    result[StandardProfile.taxonomy_id] = result[
        StandardProfile.taxonomy_id
    ].astype(int)
    # Replace missing values (unclassified reads) with ID zero and sum reads.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
kraken2
Classes
Modules
kraken2_profile

Provide a description of the kraken2 profile format.

Attributes
KRAKEN2_PERCENT_TOLERANCE = 1.0 module-attribute
KRAKEN2_PERCENT_TOTAL = 100.0 module-attribute
Classes
Kraken2Profile

Bases: BaseDataFrameModel

Define the expected kraken2 profile format.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile.py
class Kraken2Profile(BaseDataFrameModel):
    """Define the expected kraken2 profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0)
    clade_assigned_reads: Series[int] = pa.Field(ge=0)
    direct_assigned_reads: Series[int] = pa.Field(ge=0)
    num_minimizers: Optional[Series[int]] = pa.Field(ge=0)
    distinct_minimizers: Optional[Series[int]] = pa.Field(ge=0)
    taxonomy_lvl: Series[str] = pa.Field()
    taxonomy_id: Series[int] = pa.Field(ge=0)
    name: Series[str] = pa.Field()

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
        # Kraken2 reports percentages only to the second decimal, so we expect
        # some deviation.
        # If 100% of reads are assigned, unclassified reads are not reported at all.
        return profile.empty or bool(
            np.isclose(
                profile.loc[
                    profile[cls.taxonomy_lvl].isin(["U", "R"]), cls.percent
                ].sum(),
                KRAKEN2_PERCENT_TOTAL,
                atol=KRAKEN2_PERCENT_TOLERANCE,
            )
        )
Attributes
clade_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
direct_assigned_reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
distinct_minimizers: Optional[Series[int]] = pa.Field(ge=0) class-attribute instance-attribute
name: Series[str] = pa.Field() class-attribute instance-attribute
num_minimizers: Optional[Series[int]] = pa.Field(ge=0) class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_lvl: Series[str] = pa.Field() class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percent of 'unclassified' and 'root' add up to a hundred.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percent of 'unclassified' and 'root' add up to a hundred."""
    # Kraken2 reports percentages only to the second decimal, so we expect
    # some deviation.
    # If 100% of reads are assigned, unclassified reads are not reported at all.
    return profile.empty or bool(
        np.isclose(
            profile.loc[
                profile[cls.taxonomy_lvl].isin(["U", "R"]), cls.percent
            ].sum(),
            KRAKEN2_PERCENT_TOTAL,
            atol=KRAKEN2_PERCENT_TOLERANCE,
        )
    )
kraken2_profile_reader

Provide a reader for kraken2 profiles.

Classes
Kraken2ProfileReader

Bases: ProfileReader

Define a reader for kraken2 profiles.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_reader.py
class Kraken2ProfileReader(ProfileReader):
    """Define a reader for kraken2 profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[Kraken2Profile]:
        """
        Read a kraken2 taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by kraken2.

        Returns:
            A data frame representation of the kraken2 profile.

        Raises:
            ValueError: In case the table does not contain exactly six or eight columns.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            header=None,
            index_col=False,
            skipinitialspace=True,
        )
        if len(result.columns) == 6:
            result.columns = [
                Kraken2Profile.percent,
                Kraken2Profile.clade_assigned_reads,
                Kraken2Profile.direct_assigned_reads,
                Kraken2Profile.taxonomy_lvl,
                Kraken2Profile.taxonomy_id,
                Kraken2Profile.name,
            ]
        elif len(result.columns) == 8:
            result.columns = [
                Kraken2Profile.percent,
                Kraken2Profile.clade_assigned_reads,
                Kraken2Profile.direct_assigned_reads,
                Kraken2Profile.num_minimizers,
                Kraken2Profile.distinct_minimizers,
                Kraken2Profile.taxonomy_lvl,
                Kraken2Profile.taxonomy_id,
                Kraken2Profile.name,
            ]
        else:
            raise ValueError(
                f"Unexpected kraken2 report format. It has {len(result.columns)} "
                f"columns but only six or eight are expected."
            )
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[Kraken2Profile] classmethod

Read a kraken2 taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by kraken2. Required.

Returns:

    DataFrame[Kraken2Profile]: A data frame representation of the kraken2 profile.

Raises:

    ValueError: In case the table does not contain exactly six or eight columns.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[Kraken2Profile]:
    """
    Read a kraken2 taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by kraken2.

    Returns:
        A data frame representation of the kraken2 profile.

    Raises:
        ValueError: In case the table does not contain exactly six or eight columns.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        header=None,
        index_col=False,
        skipinitialspace=True,
    )
    if len(result.columns) == 6:
        result.columns = [
            Kraken2Profile.percent,
            Kraken2Profile.clade_assigned_reads,
            Kraken2Profile.direct_assigned_reads,
            Kraken2Profile.taxonomy_lvl,
            Kraken2Profile.taxonomy_id,
            Kraken2Profile.name,
        ]
    elif len(result.columns) == 8:
        result.columns = [
            Kraken2Profile.percent,
            Kraken2Profile.clade_assigned_reads,
            Kraken2Profile.direct_assigned_reads,
            Kraken2Profile.num_minimizers,
            Kraken2Profile.distinct_minimizers,
            Kraken2Profile.taxonomy_lvl,
            Kraken2Profile.taxonomy_id,
            Kraken2Profile.name,
        ]
    else:
        raise ValueError(
            f"Unexpected kraken2 report format. It has {len(result.columns)} "
            f"columns but only six or eight are expected."
        )
    return result
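A usage sketch for the six-column layout, assuming taxpasta is installed and the import path matches the source file shown above. Kraken2 reports carry no header line, which is why the reader passes header=None and assigns the column names itself (rows invented):

import io

from taxpasta.infrastructure.application.kraken2.kraken2_profile_reader import (
    Kraken2ProfileReader,
)

report = io.StringIO(
    "90.00\t900\t900\tU\t0\tunclassified\n"
    "10.00\t100\t0\tR\t1\troot\n"
)
frame = Kraken2ProfileReader.read(report)
print(list(frame.columns))
# ['percent', 'clade_assigned_reads', 'direct_assigned_reads',
#  'taxonomy_lvl', 'taxonomy_id', 'name']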
kraken2_profile_standardisation_service

Provide a standardisation service for kraken2 profiles.

Classes
Kraken2ProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for kraken2 profiles.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_standardisation_service.py
class Kraken2ProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for kraken2 profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[Kraken2Profile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given kraken2 profile.

        Args:
            profile: A taxonomic profile generated by kraken2.

        Returns:
            A standardized profile.

        """
        return (
            profile[[Kraken2Profile.taxonomy_id, Kraken2Profile.direct_assigned_reads]]
            .copy()
            .rename(
                columns={
                    Kraken2Profile.taxonomy_id: StandardProfile.taxonomy_id,
                    Kraken2Profile.direct_assigned_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[Kraken2Profile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given kraken2 profile.

Parameters:

    profile (DataFrame[Kraken2Profile]): A taxonomic profile generated by kraken2. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/kraken2/kraken2_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[Kraken2Profile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given kraken2 profile.

    Args:
        profile: A taxonomic profile generated by kraken2.

    Returns:
        A standardized profile.

    """
    return (
        profile[[Kraken2Profile.taxonomy_id, Kraken2Profile.direct_assigned_reads]]
        .copy()
        .rename(
            columns={
                Kraken2Profile.taxonomy_id: StandardProfile.taxonomy_id,
                Kraken2Profile.direct_assigned_reads: StandardProfile.count,
            }
        )
    )
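The service keeps direct_assigned_reads rather than clade_assigned_reads because clade counts include all reads assigned to a taxon's descendants, so summing them across rows would count reads more than once. A small illustration with invented numbers:

# Toy three-row report: root, one genus, and one species under that genus.
rows = [
    # (clade_assigned_reads, direct_assigned_reads)
    (100, 10),  # root: 100 reads in the whole clade, 10 assigned directly
    (90, 30),   # genus: clade count includes the species' reads
    (60, 60),   # species: a leaf, so clade equals direct
]
print(sum(clade for clade, _ in rows))    # 250: descendants counted repeatedly
print(sum(direct for _, direct in rows))  # 100: matches the true read total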
krakenuniq
Classes
Modules
krakenuniq_profile

Provide a description of the KrakenUniq profile format.

Classes
KrakenUniqProfile

Bases: BaseDataFrameModel

Define the expected KrakenUniq profile format.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile.py
class KrakenUniqProfile(BaseDataFrameModel):
    """Define the expected KrakenUniq profile format."""

    percent: Series[float] = pa.Field(ge=0.0, le=100.0, alias="%")
    reads: Series[int] = pa.Field(ge=0)
    tax_reads: Series[int] = pa.Field(ge=0, alias="taxReads")
    kmers: Series[int] = pa.Field(ge=0)
    duplicates: Series[float] = pa.Field(ge=0.0, alias="dup")
    coverage: Series[float] = pa.Field(ge=0.0, nullable=True, alias="cov")
    tax_id: Series[int] = pa.Field(alias="taxID", ge=0)
    rank: Series[str] = pa.Field()
    tax_name: Series[str] = pa.Field(alias="taxName")
Attributes
coverage: Series[float] = pa.Field(ge=0.0, nullable=True, alias='cov') class-attribute instance-attribute
duplicates: Series[float] = pa.Field(ge=0.0, alias='dup') class-attribute instance-attribute
kmers: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
percent: Series[float] = pa.Field(ge=0.0, le=100.0, alias='%') class-attribute instance-attribute
rank: Series[str] = pa.Field() class-attribute instance-attribute
reads: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
tax_id: Series[int] = pa.Field(alias='taxID', ge=0) class-attribute instance-attribute
tax_name: Series[str] = pa.Field(alias='taxName') class-attribute instance-attribute
tax_reads: Series[int] = pa.Field(ge=0, alias='taxReads') class-attribute instance-attribute
krakenuniq_profile_reader

Provide a reader for KrakenUniq profiles.

Classes
KrakenUniqProfileReader

Bases: ProfileReader

Define a reader for KrakenUniq profiles.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_reader.py
class KrakenUniqProfileReader(ProfileReader):
    """Define a reader for KrakenUniq profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile]:
        """
        Read a KrakenUniq taxonomic profile from the given source.

        Args:
            profile: A source that contains a tab-separated taxonomic profile generated
                by KrakenUniq.

        Returns:
            A data frame representation of the KrakenUniq profile.

        """
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=2,
            header=0,
            index_col=False,
            skipinitialspace=True,
            dtype={
                KrakenUniqProfile.percent: float,
                KrakenUniqProfile.duplicates: float,
                KrakenUniqProfile.coverage: float,
            },
        )
        cls._check_num_columns(result, KrakenUniqProfile)
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile] classmethod

Read a KrakenUniq taxonomic profile from the given source.

Parameters:

    profile (BufferOrFilepath): A source that contains a tab-separated taxonomic profile generated by KrakenUniq. Required.

Returns:

    DataFrame[KrakenUniqProfile]: A data frame representation of the KrakenUniq profile.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[KrakenUniqProfile]:
    """
    Read a KrakenUniq taxonomic profile from the given source.

    Args:
        profile: A source that contains a tab-separated taxonomic profile generated
            by KrakenUniq.

    Returns:
        A data frame representation of the KrakenUniq profile.

    """
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=2,
        header=0,
        index_col=False,
        skipinitialspace=True,
        dtype={
            KrakenUniqProfile.percent: float,
            KrakenUniqProfile.duplicates: float,
            KrakenUniqProfile.coverage: float,
        },
    )
    cls._check_num_columns(result, KrakenUniqProfile)
    return result
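The skiprows=2 implies that KrakenUniq reports begin with two comment lines before the header row; that detail is inferred from the reader itself, not stated elsewhere on this page. A self-contained pandas sketch of the equivalent call on an invented in-memory report:

import io

import pandas as pd

report = io.StringIO(
    "# KrakenUniq report\n"
    "# second comment line\n"
    "%\treads\ttaxReads\tkmers\tdup\tcov\ttaxID\trank\ttaxName\n"
    "100.0\t1000\t10\t5000\t1.1\t0.5\t1\tno rank\troot\n"
)
frame = pd.read_table(report, sep="\t", skiprows=2, header=0, index_col=False)
print(frame["taxID"].tolist())  # [1]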
krakenuniq_profile_standardisation_service

Provide a standardisation service for KrakenUniq profiles.

Classes
KrakenUniqProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for KrakenUniq profiles.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_standardisation_service.py
class KrakenUniqProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for krakenUniq profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[KrakenUniqProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given KrakenUniq profile.

        Args:
            profile: A taxonomic profile generated by KrakenUniq.

        Returns:
            A standardized profile.

        """
        return (
            profile[[KrakenUniqProfile.tax_id, KrakenUniqProfile.tax_reads]]
            .copy()
            .rename(
                columns={
                    KrakenUniqProfile.tax_id: StandardProfile.taxonomy_id,
                    KrakenUniqProfile.tax_reads: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[KrakenUniqProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given KrakenUniq profile.

Parameters:

    profile (DataFrame[KrakenUniqProfile]): A taxonomic profile generated by KrakenUniq. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/krakenuniq/krakenuniq_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[KrakenUniqProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given KrakenUniq profile.

    Args:
        profile: A taxonomic profile generated by KrakenUniq.

    Returns:
        A standardized profile.

    """
    return (
        profile[[KrakenUniqProfile.tax_id, KrakenUniqProfile.tax_reads]]
        .copy()
        .rename(
            columns={
                KrakenUniqProfile.tax_id: StandardProfile.taxonomy_id,
                KrakenUniqProfile.tax_reads: StandardProfile.count,
            }
        )
    )
megan6
Classes
Modules
megan6_profile

Provide a description of the MEGAN6 rma2info profile format.

Classes
Megan6Profile

Bases: BaseDataFrameModel

Define the expected MEGAN6 rma2info profile format.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile.py
class Megan6Profile(BaseDataFrameModel):
    """Define the expected MEGAN6 rma2info profile format."""

    taxonomy_id: Series[int] = pa.Field(ge=0)
    count: Series[float] = pa.Field(ge=0.0)
Attributes
count: Series[float] = pa.Field(ge=0.0) class-attribute instance-attribute
taxonomy_id: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
megan6_profile_reader

Provide a reader for megan6 profiles.

Classes
Megan6ProfileReader

Bases: ProfileReader

Define a reader for MEGAN6 rma2info profiles.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_reader.py
class Megan6ProfileReader(ProfileReader):
    """Define a reader for MEGAN6 rma2info profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[Megan6Profile]:
        """Read a MEGAN6 rma2info taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            index_col=False,
            header=None,
        )
        cls._check_num_columns(result, Megan6Profile)
        result.columns = [Megan6Profile.taxonomy_id, Megan6Profile.count]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[Megan6Profile] classmethod

Read a MEGAN6 rma2info taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[Megan6Profile]:
    """Read a MEGAN6 rma2info taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        index_col=False,
        header=None,
    )
    cls._check_num_columns(result, Megan6Profile)
    result.columns = [Megan6Profile.taxonomy_id, Megan6Profile.count]
    return result
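A usage sketch, assuming taxpasta is installed and the import path matches the source file shown above. The reader expects a headerless two-column table, as produced by MEGAN6's rma2info export; the rows below are invented:

import io

from taxpasta.infrastructure.application.megan6.megan6_profile_reader import (
    Megan6ProfileReader,
)

report = io.StringIO("562\t900.0\n1280\t100.0\n")
frame = Megan6ProfileReader.read(report)
print(list(frame.columns))  # ['taxonomy_id', 'count']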
megan6_profile_standardisation_service

Provide a standardisation service for megan6 profiles.

Classes
Megan6ProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for megan6 profiles.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_standardisation_service.py
class Megan6ProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for megan6 profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given MEGAN6 rma2info profile.

        Args:
            profile: A taxonomic profile generated by MEGAN6 rma2info.

        Returns:
            A standardized profile.

        """
        return (
            profile[[Megan6Profile.taxonomy_id, Megan6Profile.count]]
            .copy()
            .rename(
                columns={
                    Megan6Profile.taxonomy_id: StandardProfile.taxonomy_id,
                    Megan6Profile.count: StandardProfile.count,
                }
            )
        )
Functions
transform(profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given MEGAN6 rma2info profile.

Parameters:

    profile (DataFrame[Megan6Profile]): A taxonomic profile generated by MEGAN6 rma2info. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/megan6/megan6_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[Megan6Profile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given MEGAN6 rma2info profile.

    Args:
        profile: A taxonomic profile generated by MEGAN6 rma2info.

    Returns:
        A standardized profile.

    """
    return (
        profile[[Megan6Profile.taxonomy_id, Megan6Profile.count]]
        .copy()
        .rename(
            columns={
                Megan6Profile.taxonomy_id: StandardProfile.taxonomy_id,
                Megan6Profile.count: StandardProfile.count,
            }
        )
    )
metaphlan
Classes
Modules
metaphlan_profile

Provide a description of the metaphlan profile format.

Attributes
METAPHLAN_PERCENT_TOLERANCE = 1.0 module-attribute
METAPHLAN_PERCENT_TOTAL = 100.0 module-attribute
Classes
MetaphlanProfile

Bases: BaseDataFrameModel

Define the expected metaphlan profile format.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile.py
class MetaphlanProfile(BaseDataFrameModel):
    """Define the expected metaphlan profile format."""

    clade_name: Series[str] = pa.Field()
    # MetaPhlAn provides the full lineage of tax IDs in this field.
    ncbi_tax_id: Series[str] = pa.Field(alias="NCBI_tax_id")
    relative_abundance: Series[float] = pa.Field(ge=0.0, le=100.0)
    additional_species: Optional[Series[str]] = pa.Field(nullable=True)

    @pa.dataframe_check(name="compositionality", raise_warning=True)
    def check_compositionality(cls, profile: pd.DataFrame) -> bool:
        """Check that the percentages per rank add up to a hundred."""
        # Parse the rank from the given lineage.
        rank = profile[cls.clade_name].str.rsplit("|", n=1).str[-1].str[0]
        return profile.empty or bool(
            np.allclose(
                profile.groupby(rank, sort=False)[cls.relative_abundance].sum(),
                METAPHLAN_PERCENT_TOTAL,
                atol=METAPHLAN_PERCENT_TOLERANCE,
            )
        )
Attributes
additional_species: Optional[Series[str]] = pa.Field(nullable=True) class-attribute instance-attribute
clade_name: Series[str] = pa.Field() class-attribute instance-attribute
ncbi_tax_id: Series[str] = pa.Field(alias='NCBI_tax_id') class-attribute instance-attribute
relative_abundance: Series[float] = pa.Field(ge=0.0, le=100.0) class-attribute instance-attribute
Functions
check_compositionality(profile: pd.DataFrame) -> bool

Check that the percentages per rank add up to a hundred.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile.py
@pa.dataframe_check(name="compositionality", raise_warning=True)
def check_compositionality(cls, profile: pd.DataFrame) -> bool:
    """Check that the percentages per rank add up to a hundred."""
    # Parse the rank from the given lineage.
    rank = profile[cls.clade_name].str.rsplit("|", n=1).str[-1].str[0]
    return profile.empty or bool(
        np.allclose(
            profile.groupby(rank, sort=False)[cls.relative_abundance].sum(),
            METAPHLAN_PERCENT_TOTAL,
            atol=METAPHLAN_PERCENT_TOLERANCE,
        )
    )
metaphlan_profile_reader

Provide a reader for metaphlan profiles.

Classes
MetaphlanProfileReader

Bases: ProfileReader

Define a reader for Metaphlan profiles.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_reader.py
class MetaphlanProfileReader(ProfileReader):
    """Define a reader for Metaphlan profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile]:
        """Read a metaphlan taxonomic profile from a file."""
        num_header_lines = cls._detect_number_header_line(profile)
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=num_header_lines,
            header=None,
            index_col=False,
            dtype={1: str},
        )
        cls._check_num_columns(result, MetaphlanProfile)
        result.columns = [
            MetaphlanProfile.clade_name,
            MetaphlanProfile.ncbi_tax_id,
            MetaphlanProfile.relative_abundance,
            MetaphlanProfile.additional_species,
        ]
        return result

    @classmethod
    def _detect_number_header_line(cls, profile: BufferOrFilepath) -> int:
        """
        Detect the number of comment lines in the header of a MetaPhlAn profile.

        The number of lines varies at least between versions 3 & 4.

        """
        if isinstance(profile, BinaryIO):
            # We assume default file encoding here (UTF-8 in most environments).
            result = cls._detect_first_content_line(buffer=TextIOWrapper(profile))
            profile.seek(0)
            return result
        elif isinstance(profile, TextIO):
            result = cls._detect_first_content_line(buffer=profile)
            profile.seek(0)
            return result
        else:
            with Path(profile).open(mode="r") as handle:
                return cls._detect_first_content_line(buffer=handle)

    @classmethod
    def _detect_first_content_line(
        cls, buffer: TextIO, comment_marker: str = "#", max_lines: int = 10
    ) -> int:
        """Detect the first non-comment line in the given text buffer."""
        for num, line in enumerate(buffer):
            if not line.startswith(comment_marker):
                return num
            if num >= max_lines:
                raise ValueError(
                    "Unexpectedly large number of comment lines in MetaPhlAn "
                    "profile (>10)."
                )
        else:
            raise ValueError("Could not detect any content lines in MetaPhlAn profile.")
Functions
read(profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile] classmethod

Read a metaphlan taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[MetaphlanProfile]:
    """Read a metaphlan taxonomic profile from a file."""
    num_header_lines = cls._detect_number_header_line(profile)
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=num_header_lines,
        header=None,
        index_col=False,
        dtype={1: str},
    )
    cls._check_num_columns(result, MetaphlanProfile)
    result.columns = [
        MetaphlanProfile.clade_name,
        MetaphlanProfile.ncbi_tax_id,
        MetaphlanProfile.relative_abundance,
        MetaphlanProfile.additional_species,
    ]
    return result
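Header detection boils down to counting leading '#' lines, which keeps the reader working across MetaPhlAn versions with headers of different lengths. A standalone sketch of the same idea; the helper and the sample lines are invented for illustration:

import io

def first_content_line(buffer, comment_marker="#"):
    # Index of the first line that does not start with the comment marker.
    for num, line in enumerate(buffer):
        if not line.startswith(comment_marker):
            return num
    raise ValueError("no content lines found")

profile = io.StringIO(
    "#mpa_vJan21\n"
    "#clade_name\tNCBI_tax_id\trelative_abundance\tadditional_species\n"
    "k__Bacteria\t2\t100.0\t\n"
)
print(first_content_line(profile))  # 2: both header lines start with '#'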
metaphlan_profile_standardisation_service

Provide a standardisation service for metaphlan profiles.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
MetaphlanProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for metaphlan profiles.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_standardisation_service.py
class MetaphlanProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for metaphlan profiles."""

    # Metaphlan only reports up to six decimals so this number should be large enough.
    LARGE_INTEGER = 1_000_000

    @classmethod
    @pa.check_types(lazy=True)
    def transform(
        cls, profile: DataFrame[MetaphlanProfile]
    ) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given metaphlan profile.

        Args:
            profile: A taxonomic profile generated by metaphlan.

        Returns:
            A standardized profile.

        """
        result = (
            profile[[MetaphlanProfile.ncbi_tax_id, MetaphlanProfile.relative_abundance]]
            .copy()
            .rename(
                columns={
                    MetaphlanProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                    MetaphlanProfile.relative_abundance: StandardProfile.count,
                }
            )
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ]
                    .str.rsplit("|", n=1)
                    .str[-1],
                    StandardProfile.count: lambda df: df[StandardProfile.count]
                    * cls.LARGE_INTEGER,
                }
            )
            .assign(
                **{
                    StandardProfile.count: lambda df: df[StandardProfile.count].astype(
                        int
                    )
                }
            )
        )
        result[StandardProfile.taxonomy_id] = pd.to_numeric(
            result[StandardProfile.taxonomy_id], errors="coerce"
        ).astype("Int64")
        unclassified_mask = result[StandardProfile.taxonomy_id].isna() | (
            result[StandardProfile.taxonomy_id] == -1
        )
        num = int(unclassified_mask.sum())
        if num > 0:
            logger.warning(
                "Combining %d entries with unclassified taxa in the profile.", num
            )
        return pd.concat(
            [
                result.loc[~unclassified_mask, :],
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            result.loc[unclassified_mask, StandardProfile.count].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Attributes
LARGE_INTEGER = 1000000 class-attribute instance-attribute
Functions
transform(profile: DataFrame[MetaphlanProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given metaphlan profile.

Parameters:

    profile (DataFrame[MetaphlanProfile]): A taxonomic profile generated by metaphlan. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/metaphlan/metaphlan_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(
    cls, profile: DataFrame[MetaphlanProfile]
) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given metaphlan profile.

    Args:
        profile: A taxonomic profile generated by metaphlan.

    Returns:
        A standardized profile.

    """
    result = (
        profile[[MetaphlanProfile.ncbi_tax_id, MetaphlanProfile.relative_abundance]]
        .copy()
        .rename(
            columns={
                MetaphlanProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                MetaphlanProfile.relative_abundance: StandardProfile.count,
            }
        )
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ]
                .str.rsplit("|", n=1)
                .str[-1],
                StandardProfile.count: lambda df: df[StandardProfile.count]
                * cls.LARGE_INTEGER,
            }
        )
        .assign(
            **{
                StandardProfile.count: lambda df: df[StandardProfile.count].astype(
                    int
                )
            }
        )
    )
    result[StandardProfile.taxonomy_id] = pd.to_numeric(
        result[StandardProfile.taxonomy_id], errors="coerce"
    ).astype("Int64")
    unclassified_mask = result[StandardProfile.taxonomy_id].isna() | (
        result[StandardProfile.taxonomy_id] == -1
    )
    num = int(unclassified_mask.sum())
    if num > 0:
        logger.warning(
            "Combining %d entries with unclassified taxa in the profile.", num
        )
    return pd.concat(
        [
            result.loc[~unclassified_mask, :],
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        result.loc[unclassified_mask, StandardProfile.count].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
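The LARGE_INTEGER scaling converts relative abundances into integer pseudo-counts that fit the StandardProfile count column; since MetaPhlAn reports at most six decimal places, multiplying by 1,000,000 before the integer cast loses no reported precision. A quick check with an invented abundance:

relative_abundance = 12.5  # percent, as reported by MetaPhlAn
pseudo_count = int(relative_abundance * 1_000_000)
print(pseudo_count)  # 12500000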
motus
Classes
Modules
motus_profile

Provide a description of the mOTUs profile format.

Classes
MotusProfile

Bases: BaseDataFrameModel

Define the expected mOTUs profile format.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile.py
class MotusProfile(BaseDataFrameModel):
    """Define the expected mOTUs profile format."""

    consensus_taxonomy: Series[str] = pa.Field()
    ncbi_tax_id: Series[pd.Int64Dtype] = pa.Field(nullable=True)
    read_count: Series[int] = pa.Field(ge=0)
Attributes
consensus_taxonomy: Series[str] = pa.Field() class-attribute instance-attribute
ncbi_tax_id: Series[pd.Int64Dtype] = pa.Field(nullable=True) class-attribute instance-attribute
read_count: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
motus_profile_reader

Provide a reader for motus profiles.

Classes
MotusProfileReader

Bases: ProfileReader

Define a reader for mOTUs profiles.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_reader.py
class MotusProfileReader(ProfileReader):
    """Define a reader for mOTUS profiles."""

    @classmethod
    @raise_parser_warnings
    def read(cls, profile: BufferOrFilepath) -> DataFrame[MotusProfile]:
        """Read a mOTUs taxonomic profile from a file."""
        result = pd.read_table(
            filepath_or_buffer=profile,
            sep="\t",
            skiprows=3,
            header=None,
            index_col=False,
            dtype={1: "Int64"},
        )
        cls._check_num_columns(result, MotusProfile)
        result.columns = [
            MotusProfile.consensus_taxonomy,
            MotusProfile.ncbi_tax_id,
            MotusProfile.read_count,
        ]
        return result
Functions
read(profile: BufferOrFilepath) -> DataFrame[MotusProfile] classmethod

Read a mOTUs taxonomic profile from a file.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_reader.py
@classmethod
@raise_parser_warnings
def read(cls, profile: BufferOrFilepath) -> DataFrame[MotusProfile]:
    """Read a mOTUs taxonomic profile from a file."""
    result = pd.read_table(
        filepath_or_buffer=profile,
        sep="\t",
        skiprows=3,
        header=None,
        index_col=False,
        dtype={1: "Int64"},
    )
    cls._check_num_columns(result, MotusProfile)
    result.columns = [
        MotusProfile.consensus_taxonomy,
        MotusProfile.ncbi_tax_id,
        MotusProfile.read_count,
    ]
    return result
motus_profile_standardisation_service

Provide a standardisation service for mOTUs profiles.

Classes
MotusProfileStandardisationService

Bases: ProfileStandardisationService

Define a standardisation service for mOTUs profiles.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_standardisation_service.py
class MotusProfileStandardisationService(ProfileStandardisationService):
    """Define a standardisation service for mOTUs profiles."""

    @classmethod
    @pa.check_types(lazy=True)
    def transform(cls, profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile]:
        """
        Tidy up and standardize a given mOTUs profile.

        Args:
            profile: A taxonomic profile generated by mOTUs.

        Returns:
            A standardized profile.

        """
        temp = (
            profile.loc[
                # Ignore entries with zero read count.
                profile[MotusProfile.read_count] > 0,
                [MotusProfile.ncbi_tax_id, MotusProfile.read_count],
            ]
            .copy()
            .rename(
                columns={
                    MotusProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                    MotusProfile.read_count: StandardProfile.count,
                }
            )
        )
        # Split profile into entries with known and unknown tax ID.
        result = (
            temp.loc[temp[StandardProfile.taxonomy_id].notna(), :]
            .copy()
            .assign(
                **{
                    StandardProfile.taxonomy_id: lambda df: df[
                        StandardProfile.taxonomy_id
                    ].astype(int)
                }
            )
            # FIXME (Moritz): Apparently, mOTUs profiles can contain duplicate tax IDs.
            #  Clarify with Sofia and Maxime. For now, sum up read counts.
            #  https://github.com/taxprofiler/taxpasta/issues/46
            .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
            .sum()
        )
        # Sum up all remaining read counts without tax ID to be 'unassigned'.
        return pd.concat(
            [
                result,
                pd.DataFrame(
                    {
                        StandardProfile.taxonomy_id: [0],
                        StandardProfile.count: [
                            temp.loc[
                                temp[StandardProfile.taxonomy_id].isna(),
                                StandardProfile.count,
                            ].sum()
                        ],
                    },
                    dtype=int,
                ),
            ],
            ignore_index=True,
        )
Functions
transform(profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile] classmethod

Tidy up and standardize a given mOTUs profile.

Parameters:

    profile (DataFrame[MotusProfile]): A taxonomic profile generated by mOTUs. Required.

Returns:

    DataFrame[StandardProfile]: A standardized profile.

Source code in src/taxpasta/infrastructure/application/motus/motus_profile_standardisation_service.py
@classmethod
@pa.check_types(lazy=True)
def transform(cls, profile: DataFrame[MotusProfile]) -> DataFrame[StandardProfile]:
    """
    Tidy up and standardize a given mOTUs profile.

    Args:
        profile: A taxonomic profile generated by mOTUs.

    Returns:
        A standardized profile.

    """
    temp = (
        profile.loc[
            # Ignore entries with zero read count.
            profile[MotusProfile.read_count] > 0,
            [MotusProfile.ncbi_tax_id, MotusProfile.read_count],
        ]
        .copy()
        .rename(
            columns={
                MotusProfile.ncbi_tax_id: StandardProfile.taxonomy_id,
                MotusProfile.read_count: StandardProfile.count,
            }
        )
    )
    # Split profile into entries with known and unknown tax ID.
    result = (
        temp.loc[temp[StandardProfile.taxonomy_id].notna(), :]
        .copy()
        .assign(
            **{
                StandardProfile.taxonomy_id: lambda df: df[
                    StandardProfile.taxonomy_id
                ].astype(int)
            }
        )
        # FIXME (Moritz): Apparently, mOTUs profiles can contain duplicate tax IDs.
        #  Clarify with Sofia and Maxime. For now, sum up read counts.
        #  https://github.com/taxprofiler/taxpasta/issues/46
        .groupby(StandardProfile.taxonomy_id, as_index=False, sort=False)
        .sum()
    )
    # Sum up all remaining read counts without tax ID to be 'unassigned'.
    return pd.concat(
        [
            result,
            pd.DataFrame(
                {
                    StandardProfile.taxonomy_id: [0],
                    StandardProfile.count: [
                        temp.loc[
                            temp[StandardProfile.taxonomy_id].isna(),
                            StandardProfile.count,
                        ].sum()
                    ],
                },
                dtype=int,
            ),
        ],
        ignore_index=True,
    )
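For orientation, here is a minimal usage sketch with a toy profile. The column names and dtypes are assumptions based on the MotusProfile schema referenced above; the pandera check on transform will reject frames that deviate from that schema, and the data themselves are invented.

import pandas as pd

from taxpasta.infrastructure.application.motus import (
    MotusProfileStandardisationService,
)

# Two rows share the same tax ID and one row lacks a tax ID entirely.
toy = pd.DataFrame(
    {
        "consensus_taxonomy": ["Escherichia coli", "Escherichia coli", "unassigned"],
        "ncbi_tax_id": [562, 562, None],
        "read_count": [10, 5, 3],
    }
)
result = MotusProfileStandardisationService.transform(toy)
# Duplicate tax IDs are summed, and entries without a tax ID collapse into
# taxonomy ID 0 ('unassigned'):
#    taxonomy_id  count
#            562     15
#              0      3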
sample_sheet

Provide a description of samples and profile locations.

Classes
SampleSheet

Bases: DataFrameModel

Define a description of samples and profile locations.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
class SampleSheet(pa.DataFrameModel):
    """Define a description of samples and profile locations."""

    sample: Series[str] = pa.Field()
    profile: Series[str] = pa.Field()  # type: ignore

    @pa.dataframe_check
    @classmethod
    def check_number_samples(cls, table: DataFrame) -> bool:
        """Check that there are at least two samples."""
        return (table[cls.sample].notnull() & table[cls.profile].notnull()).sum() > 1

    @pa.check("profile", name="profile_presence")
    @classmethod
    def check_profile_presence(
        cls, profile: Series[str]  # type: ignore
    ) -> Series[bool]:
        """Check that every profile is present at the specified location."""
        return cast(Series[bool], profile.map(lambda path: Path(path).is_file()))

    class Config:
        """Configure the schema model."""

        coerce = True
        ordered = True
        strict = True
Attributes
profile: Series[str] = pa.Field() class-attribute instance-attribute
sample: Series[str] = pa.Field() class-attribute instance-attribute
Classes
Config

Configure the schema model.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
class Config:
    """Configure the schema model."""

    coerce = True
    ordered = True
    strict = True
Attributes
coerce = True class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute
Functions
check_number_samples(table: DataFrame) -> bool classmethod

Check that there are at least two samples.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
@pa.dataframe_check
@classmethod
def check_number_samples(cls, table: DataFrame) -> bool:
    """Check that there are at least two samples."""
    return (table[cls.sample].notnull() & table[cls.profile].notnull()).sum() > 1
check_profile_presence(profile: Series[str]) -> Series[bool] classmethod

Check that every profile is present at the specified location.

Source code in src/taxpasta/infrastructure/application/sample_sheet.py
@pa.check("profile", name="profile_presence")
@classmethod
def check_profile_presence(
    cls, profile: Series[str]  # type: ignore
) -> Series[bool]:
    """Check that every profile is present at the specified location."""
    return cast(Series[bool], profile.map(lambda path: Path(path).is_file()))
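A minimal validation sketch; it assumes SampleSheet is re-exported from taxpasta.infrastructure.application, and the two profile paths are hypothetical, so they must point to existing files for the profile_presence check to pass.

import pandas as pd

from taxpasta.infrastructure.application import SampleSheet

sheet = pd.DataFrame(
    {
        "sample": ["sample_1", "sample_2"],
        "profile": ["profiles/sample_1.tsv", "profiles/sample_2.tsv"],
    }
)
# Raises pandera.errors.SchemaErrors if fewer than two complete rows exist
# or if any profile path does not point to an existing file.
SampleSheet.validate(sheet, lazy=True)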
standard_profile_file_format

Provide a service for supported tabular file formats.

Classes
StandardProfileFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported standardized profile file formats.

Source code in src/taxpasta/infrastructure/application/standard_profile_file_format.py
@unique
class StandardProfileFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported standardized profile file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
Attributes
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
standard_profile_writer
Modules
arrow_standard_profile_writer

Provide an arrow writer.

Classes
ArrowStandardProfileWriter

Bases: StandardProfileWriter

Define the arrow writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/arrow_standard_profile_writer.py
class ArrowStandardProfileWriter(StandardProfileWriter):
    """Define the arrow writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_feather(target, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/arrow_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_feather(target, **kwargs)
csv_standard_profile_writer

Provide a CSV writer.

Classes
CSVStandardProfileWriter

Bases: StandardProfileWriter

Define the CSV writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/csv_standard_profile_writer.py
class CSVStandardProfileWriter(StandardProfileWriter):
    """Define the CSV writer."""

    @classmethod
    def write(
        cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_csv(target, index=False, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/csv_standard_profile_writer.py
@classmethod
def write(
    cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_csv(target, index=False, **kwargs)
ods_standard_profile_writer

Provide an ODS writer.

Classes
ODSStandardProfileWriter

Bases: StandardProfileWriter

Define the ODS writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/ods_standard_profile_writer.py
class ODSStandardProfileWriter(StandardProfileWriter):
    """Define the ODS writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_excel(target, index=False, engine="odf", **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/ods_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_excel(target, index=False, engine="odf", **kwargs)
parquet_standard_profile_writer

Provide a parquet writer.

Classes
ParquetStandardProfileWriter

Bases: StandardProfileWriter

Define the parquet writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/parquet_standard_profile_writer.py
class ParquetStandardProfileWriter(StandardProfileWriter):
    """Define the parquet writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_parquet(target, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/parquet_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_parquet(target, **kwargs)
tsv_standard_profile_writer

Provide a TSV writer.

Classes
TSVStandardProfileWriter

Bases: StandardProfileWriter

Define the TSV writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/tsv_standard_profile_writer.py
class TSVStandardProfileWriter(StandardProfileWriter):
    """Define the TSV writer."""

    @classmethod
    def write(
        cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_csv(target, sep="\t", index=False, **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/tsv_standard_profile_writer.py
@classmethod
def write(
    cls, profile: DataFrame[StandardProfile], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_csv(target, sep="\t", index=False, **kwargs)
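All of these writers share the same classmethod interface, so a round trip through the TSV writer and the matching table reader (documented further below) is a one-liner each way. A minimal sketch with toy data and an in-memory buffer; the module paths follow the source locations shown in this document:

import io

import pandas as pd

from taxpasta.infrastructure.application.standard_profile_writer.tsv_standard_profile_writer import (
    TSVStandardProfileWriter,
)
from taxpasta.infrastructure.application.table_reader.tsv_table_reader import (
    TSVTableReader,
)

profile = pd.DataFrame({"taxonomy_id": [562, 0], "count": [15, 3]})
buffer = io.StringIO()
TSVStandardProfileWriter.write(profile, buffer)  # delegates to DataFrame.to_csv
buffer.seek(0)
assert TSVTableReader.read(buffer).equals(profile)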
xlsx_standard_profile_writer

Provide an XLSX writer.

Classes
XLSXStandardProfileWriter

Bases: StandardProfileWriter

Define the XLSX writer.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/xlsx_standard_profile_writer.py
class XLSXStandardProfileWriter(StandardProfileWriter):
    """Define the XLSX writer."""

    @classmethod
    def write(
        cls,
        profile: DataFrame[StandardProfile],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given standardized profile to the given buffer or file."""
        profile.to_excel(target, index=False, engine="openpyxl", **kwargs)
Functions
write(profile: DataFrame[StandardProfile], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given standardized profile to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/standard_profile_writer/xlsx_standard_profile_writer.py
@classmethod
def write(
    cls,
    profile: DataFrame[StandardProfile],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given standardized profile to the given buffer or file."""
    profile.to_excel(target, index=False, engine="openpyxl", **kwargs)
supported_profiler

Provide an enumeration of supported taxonomic profilers.

Classes
SupportedProfiler

Bases: str, Enum

Define supported taxonomic profilers.

Source code in src/taxpasta/infrastructure/application/supported_profiler.py
@unique
class SupportedProfiler(str, Enum):
    """Define supported taxonomic profilers."""

    bracken = "bracken"
    centrifuge = "centrifuge"
    diamond = "diamond"
    ganon = "ganon"
    kaiju = "kaiju"
    kmcp = "kmcp"
    kraken2 = "kraken2"
    krakenuniq = "krakenuniq"
    megan6 = "megan6"
    metaphlan = "metaphlan"
    motus = "motus"
Attributes
bracken = 'bracken' class-attribute instance-attribute
centrifuge = 'centrifuge' class-attribute instance-attribute
diamond = 'diamond' class-attribute instance-attribute
ganon = 'ganon' class-attribute instance-attribute
kaiju = 'kaiju' class-attribute instance-attribute
kmcp = 'kmcp' class-attribute instance-attribute
kraken2 = 'kraken2' class-attribute instance-attribute
krakenuniq = 'krakenuniq' class-attribute instance-attribute
megan6 = 'megan6' class-attribute instance-attribute
metaphlan = 'metaphlan' class-attribute instance-attribute
motus = 'motus' class-attribute instance-attribute
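Because the enum also derives from str, its values parse command-line input directly, and the members serve as lookup keys for the application service registry. A minimal sketch, assuming both names are re-exported from taxpasta.infrastructure.application:

from taxpasta.infrastructure.application import (
    ApplicationServiceRegistry,
    SupportedProfiler,
)

profiler = SupportedProfiler("kraken2")  # str values map user input to members
reader_cls = ApplicationServiceRegistry.profile_reader(profiler)
service_cls = ApplicationServiceRegistry.profile_standardisation_service(profiler)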
table_reader
Modules
arrow_table_reader

Provide an arrow reader.

Classes
ArrowTableReader

Bases: TableReader

Define the arrow reader.

Source code in src/taxpasta/infrastructure/application/table_reader/arrow_table_reader.py
class ArrowTableReader(TableReader):
    """Define the arrow reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read arrow from the given source."""
        return pd.read_feather(source, **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read arrow from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/arrow_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read arrow from the given source."""
    return pd.read_feather(source, **kwargs)
csv_table_reader

Provide a CSV reader.

Classes
CSVTableReader

Bases: TableReader

Define the CSV reader.

Source code in src/taxpasta/infrastructure/application/table_reader/csv_table_reader.py
class CSVTableReader(TableReader):
    """Define the CSV reader."""

    @classmethod
    def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read CSV from the given source."""
        return pd.read_csv(source, **kwargs)
Functions
read(source: BufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read CSV from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/csv_table_reader.py
@classmethod
def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read CSV from the given source."""
    return pd.read_csv(source, **kwargs)
ods_table_reader

Provide an ODS reader.

Classes
ODSTableReader

Bases: TableReader

Define the ODS reader.

Source code in src/taxpasta/infrastructure/application/table_reader/ods_table_reader.py
class ODSTableReader(TableReader):
    """Define the ODS reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read ODS from the given source."""
        return pd.read_excel(source, engine="odf", **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read ODS from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/ods_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read ODS from the given source."""
    return pd.read_excel(source, engine="odf", **kwargs)
parquet_table_reader

Provide a parquet reader.

Classes
ParquetTableReader

Bases: TableReader

Define the parquet reader.

Source code in src/taxpasta/infrastructure/application/table_reader/parquet_table_reader.py
class ParquetTableReader(TableReader):
    """Define the parquet reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read parquet from the given source."""
        return pd.read_parquet(source, **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read parquet from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/parquet_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read parquet from the given source."""
    return pd.read_parquet(source, **kwargs)
tsv_table_reader

Provide a TSV reader.

Classes
TSVTableReader

Bases: TableReader

Define the TSV reader.

Source code in src/taxpasta/infrastructure/application/table_reader/tsv_table_reader.py
class TSVTableReader(TableReader):
    """Define the TSV reader."""

    @classmethod
    def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read TSV from the given source."""
        return pd.read_table(source, sep="\t", **kwargs)
Functions
read(source: BufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read TSV from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/tsv_table_reader.py
@classmethod
def read(cls, source: BufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read TSV from the given source."""
    return pd.read_table(source, sep="\t", **kwargs)
xlsx_table_reader

Provide an XLSX reader.

Classes
XLSXTableReader

Bases: TableReader

Define the XLSX reader.

Source code in src/taxpasta/infrastructure/application/table_reader/xlsx_table_reader.py
class XLSXTableReader(TableReader):
    """Define the XLSX reader."""

    @classmethod
    def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
        """Read XLSX from the given source."""
        return pd.read_excel(source, engine="openpyxl", **kwargs)
Functions
read(source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame classmethod

Read XLSX from the given source.

Source code in src/taxpasta/infrastructure/application/table_reader/xlsx_table_reader.py
@classmethod
def read(cls, source: BinaryBufferOrFilepath, **kwargs) -> pd.DataFrame:
    """Read XLSX from the given source."""
    return pd.read_excel(source, engine="openpyxl", **kwargs)
table_reader_file_format

Provide a service for supported tabular file formats.

Classes
TableReaderFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported tabular file formats.

Source code in src/taxpasta/infrastructure/application/table_reader_file_format.py
@unique
class TableReaderFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported tabular file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
Attributes
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
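The DependencyCheckMixin contributes the guess_format and check_dependencies helpers that the merge command below relies on. A minimal sketch; that the .xlsx suffix maps to the XLSX member is an assumption here:

from pathlib import Path

from taxpasta.infrastructure.application import TableReaderFileFormat

fmt = TableReaderFileFormat.guess_format(Path("samples.xlsx"))
assert fmt is TableReaderFileFormat.XLSX
# Raises RuntimeError if an optional dependency, for example openpyxl, is missing.
TableReaderFileFormat.check_dependencies(fmt)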
tidy_observation_table_file_format

Provide a service for supported tabular file formats.

Classes
TidyObservationTableFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported tabular file formats.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_file_format.py
@unique
class TidyObservationTableFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported tabular file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
Attributes
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
tidy_observation_table_writer
Modules
arrow_table_writer

Provide an arrow writer.

Classes
ArrowTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the arrow writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/arrow_table_writer.py
class ArrowTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the arrow writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_feather(target, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/arrow_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_feather(target, **kwargs)
csv_table_writer

Provide a CSV writer.

Classes
CSVTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the CSV writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/csv_table_writer.py
class CSVTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the CSV writer."""

    @classmethod
    def write(
        cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_csv(target, index=False, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/csv_table_writer.py
@classmethod
def write(
    cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_csv(target, index=False, **kwargs)
ods_table_writer

Provide an ODS writer.

Classes
ODSTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the ODS writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/ods_table_writer.py
class ODSTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the ODS writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_excel(target, index=False, engine="odf", **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/ods_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_excel(target, index=False, engine="odf", **kwargs)
parquet_table_writer

Provide a parquet writer.

Classes
ParquetTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the parquet writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/parquet_table_writer.py
class ParquetTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the parquet writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_parquet(target, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/parquet_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_parquet(target, **kwargs)
tsv_table_writer

Provide a TSV writer.

Classes
TSVTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the TSV writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/tsv_table_writer.py
class TSVTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the TSV writer."""

    @classmethod
    def write(
        cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_csv(target, sep="\t", index=False, **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/tsv_table_writer.py
@classmethod
def write(
    cls, table: DataFrame[TidyObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_csv(target, sep="\t", index=False, **kwargs)
xlsx_table_writer

Provide an XLSX writer.

Classes
XLSXTidyObservationTableWriter

Bases: TidyObservationTableWriter

Define the XLSX writer.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/xlsx_table_writer.py
class XLSXTidyObservationTableWriter(TidyObservationTableWriter):
    """Define the XLSX writer."""

    @classmethod
    def write(
        cls,
        table: DataFrame[TidyObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        table.to_excel(target, index=False, engine="openpyxl", **kwargs)
Functions
write(table: DataFrame[TidyObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/tidy_observation_table_writer/xlsx_table_writer.py
@classmethod
def write(
    cls,
    table: DataFrame[TidyObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    table.to_excel(target, index=False, engine="openpyxl", **kwargs)
wide_observation_table_file_format

Provide a service for supported container file formats.

Classes
WideObservationTableFileFormat

Bases: str, DependencyCheckMixin, Enum

Define the supported container file formats.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_file_format.py
@unique
class WideObservationTableFileFormat(str, DependencyCheckMixin, Enum):
    """Define the supported container file formats."""

    TSV = "TSV"
    CSV = "CSV"
    ODS = "ODS"
    XLSX = "XLSX"
    arrow = "arrow"
    parquet = "parquet"
    BIOM = "BIOM"
Attributes
BIOM = 'BIOM' class-attribute instance-attribute
CSV = 'CSV' class-attribute instance-attribute
ODS = 'ODS' class-attribute instance-attribute
TSV = 'TSV' class-attribute instance-attribute
XLSX = 'XLSX' class-attribute instance-attribute
arrow = 'arrow' class-attribute instance-attribute
parquet = 'parquet' class-attribute instance-attribute
wide_observation_table_writer
Modules
arrow_wide_observation_table_writer

Provide an arrow writer.

Classes
ArrowWideObservationTableWriter

Bases: WideObservationTableWriter

Define the arrow writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/arrow_wide_observation_table_writer.py
class ArrowWideObservationTableWriter(WideObservationTableWriter):
    """Define the arrow writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_feather(target, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/arrow_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_feather(target, **kwargs)
biom_wide_observation_table_writer

Provide a Biological Observation Matrix (BIOM) writer.

Classes
BIOMWideObservationTableWriter

Bases: WideObservationTableWriter

Define the Biological Observation Matrix (BIOM) writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/biom_wide_observation_table_writer.py
class BIOMWideObservationTableWriter(WideObservationTableWriter):
    """Define the Biological Observation Matrix (BIOM) writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: Filepath,
        taxonomy: Optional[TaxonomyService] = None,
        generated_by: str = "taxpasta",
        **kwargs,
    ) -> None:
        """Write the given data to the given buffer or file."""
        # Drop unclassified reads.
        matrix = matrix.loc[matrix.iloc[:, 0] != 0].copy()
        if taxonomy is not None:
            observation_meta = taxonomy.format_biom_taxonomy(matrix)
            tmp = taxonomy.add_rank_lineage(matrix)
            for ranks, meta in zip(tmp.rank_lineage, observation_meta):
                meta["rank_lineage"] = ranks
        else:
            observation_meta = None
        result = Table(
            data=matrix.iloc[:, 1:].values,
            observation_ids=matrix.iloc[:, 0].astype(str),
            sample_ids=matrix.columns[1:].astype(str),
            observation_metadata=observation_meta,
            create_date=datetime.utcnow().isoformat(timespec="microseconds"),
        )
        with biom_open(str(target), permission="w") as handle:
            result.to_hdf5(handle, generated_by=generated_by)
Functions
write(matrix: DataFrame[WideObservationTable], target: Filepath, taxonomy: Optional[TaxonomyService] = None, generated_by: str = 'taxpasta', **kwargs) -> None classmethod

Write the given data to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/biom_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: Filepath,
    taxonomy: Optional[TaxonomyService] = None,
    generated_by: str = "taxpasta",
    **kwargs,
) -> None:
    """Write the given data to the given buffer or file."""
    # Drop unclassified reads.
    matrix = matrix.loc[matrix.iloc[:, 0] != 0].copy()
    if taxonomy is not None:
        observation_meta = taxonomy.format_biom_taxonomy(matrix)
        tmp = taxonomy.add_rank_lineage(matrix)
        for ranks, meta in zip(tmp.rank_lineage, observation_meta):
            meta["rank_lineage"] = ranks
    else:
        observation_meta = None
    result = Table(
        data=matrix.iloc[:, 1:].values,
        observation_ids=matrix.iloc[:, 0].astype(str),
        sample_ids=matrix.columns[1:].astype(str),
        observation_metadata=observation_meta,
        create_date=datetime.utcnow().isoformat(timespec="microseconds"),
    )
    with biom_open(str(target), permission="w") as handle:
        result.to_hdf5(handle, generated_by=generated_by)
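Unlike the text-based writers, this one needs an actual file path because BIOM tables are stored as HDF5, and it can optionally enrich observations with taxonomy metadata. A minimal sketch with a toy wide table, no taxonomy service, and a hypothetical output name; it requires the optional biom-format dependency:

import pandas as pd

from taxpasta.infrastructure.application.wide_observation_table_writer.biom_wide_observation_table_writer import (
    BIOMWideObservationTableWriter,
)

wide = pd.DataFrame(
    {"taxonomy_id": [0, 562, 1280], "sample_1": [3, 15, 7], "sample_2": [1, 9, 11]}
)
# The unclassified row (taxonomy ID 0) is dropped before the table is written.
BIOMWideObservationTableWriter.write(wide, "merged.biom")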
csv_wide_observation_table_writer

Provide a CSV writer.

Classes
CSVWideObservationTableWriter

Bases: WideObservationTableWriter

Define the CSV writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/csv_wide_observation_table_writer.py
class CSVWideObservationTableWriter(WideObservationTableWriter):
    """Define the CSV writer."""

    @classmethod
    def write(
        cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_csv(target, index=False, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/csv_wide_observation_table_writer.py
@classmethod
def write(
    cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_csv(target, index=False, **kwargs)
ods_wide_observation_table_writer

Provide an ODS writer.

Classes
ODSWideObservationTableWriter

Bases: WideObservationTableWriter

Define the ODS writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/ods_wide_observation_table_writer.py
class ODSWideObservationTableWriter(WideObservationTableWriter):
    """Define the ODS writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_excel(target, index=False, engine="odf", **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/ods_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_excel(target, index=False, engine="odf", **kwargs)
parquet_wide_observation_table_writer

Provide a parquet writer.

Classes
ParquetWideObservationTableWriter

Bases: WideObservationTableWriter

Define the parquet writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/parquet_wide_observation_table_writer.py
class ParquetWideObservationTableWriter(WideObservationTableWriter):
    """Define the parquet writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_parquet(target, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/parquet_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_parquet(target, **kwargs)
tsv_wide_observation_table_writer

Provide a TSV writer.

Classes
TSVWideObservationTableWriter

Bases: WideObservationTableWriter

Define the TSV writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/tsv_wide_observation_table_writer.py
class TSVWideObservationTableWriter(WideObservationTableWriter):
    """Define the TSV writer."""

    @classmethod
    def write(
        cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_csv(target, sep="\t", index=False, **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/tsv_wide_observation_table_writer.py
@classmethod
def write(
    cls, matrix: DataFrame[WideObservationTable], target: BufferOrFilepath, **kwargs
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_csv(target, sep="\t", index=False, **kwargs)
xlsx_wide_observation_table_writer

Provide an XLSX writer.

Classes
XLSXWideObservationTableWriter

Bases: WideObservationTableWriter

Define the XLSX writer.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/xlsx_wide_observation_table_writer.py
class XLSXWideObservationTableWriter(WideObservationTableWriter):
    """Define the XLSX writer."""

    @classmethod
    def write(
        cls,
        matrix: DataFrame[WideObservationTable],
        target: BinaryBufferOrFilepath,
        **kwargs,
    ) -> None:
        """Write the given table to the given buffer or file."""
        matrix.to_excel(target, index=False, engine="openpyxl", **kwargs)
Functions
write(matrix: DataFrame[WideObservationTable], target: BinaryBufferOrFilepath, **kwargs) -> None classmethod

Write the given table to the given buffer or file.

Source code in src/taxpasta/infrastructure/application/wide_observation_table_writer/xlsx_wide_observation_table_writer.py
@classmethod
def write(
    cls,
    matrix: DataFrame[WideObservationTable],
    target: BinaryBufferOrFilepath,
    **kwargs,
) -> None:
    """Write the given table to the given buffer or file."""
    matrix.to_excel(target, index=False, engine="openpyxl", **kwargs)
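Putting format detection and writer lookup together, the wide-format output path of the merge command below boils down to a sketch like this (the output name and toy table are invented, and the re-exports from taxpasta.infrastructure.application are assumed):

from pathlib import Path

import pandas as pd

from taxpasta.infrastructure.application import (
    ApplicationServiceRegistry,
    WideObservationTableFileFormat,
)

wide_table = pd.DataFrame({"taxonomy_id": [562, 1280], "sample_1": [15, 7]})
output = Path("merged.parquet")
fmt = WideObservationTableFileFormat.guess_format(output)
WideObservationTableFileFormat.check_dependencies(fmt)  # e.g. pyarrow for parquet
writer = ApplicationServiceRegistry.wide_observation_table_writer(fmt)
writer.write(wide_table, output)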

cli

Modules

merge

Add the merge command to the taxpasta CLI.

Attributes
logger = logging.getLogger(__name__) module-attribute
Functions
merge(profiles: Optional[List[Path]] = typer.Argument(None, metavar='[PROFILE1 PROFILE2 [...]]', help='Two or more files containing taxonomic profiles. Required unless there is a sample sheet. Filenames will be parsed as sample names.', show_default=False), profiler: SupportedProfiler = typer.Option(..., '--profiler', '-p', case_sensitive=False, help='The taxonomic profiler used. All provided profiles must come from the same tool!', show_default=False), sample_sheet: Optional[Path] = typer.Option(None, '--samplesheet', '-s', help="A table with a header and two columns: the first column named 'sample' which can be any string and the second column named 'profile' which must be a file path to an actual taxonomic abundance profile. If this option is provided, any arguments are ignored.", exists=True, file_okay=True, dir_okay=False, readable=True), samplesheet_format: Optional[TableReaderFileFormat] = typer.Option(None, case_sensitive=False, help='The file format of the sample sheet. Depending on the choice, additional package dependencies may apply. Will be parsed from the sample sheet file name but can be set explicitly.'), output: Path = typer.Option(..., '--output', '-o', help='The desired output file. By default, the file extension will be used to determine the output format, but when setting the format explicitly using the --output-format option, automatic detection is disabled.', show_default=False), output_format: Optional[WideObservationTableFileFormat] = typer.Option(None, case_sensitive=False, help='The desired output format. Depending on the choice, additional package dependencies may apply. By default it will be parsed from the output file name but it can be set explicitly and will then disable the automatic detection.'), wide_format: bool = typer.Option(True, '--wide/--long', help='Output merged abundance data in either wide or (tidy) long format. Ignored when the desired output format is BIOM.'), summarise_at: Optional[str] = typer.Option(None, '--summarise-at', '--summarize-at', help="Summarise abundance profiles at higher taxonomic rank. The provided option must match a rank in the taxonomy exactly. This is akin to the clade assigned reads provided by, for example, kraken2, where the abundances of a whole taxonomic branch are assigned to a taxon at the desired rank. Please note that abundances above the selected rank are simply ignored. No attempt is made to redistribute those down to the desired rank. Some tools, like Bracken, were designed for this purpose but it doesn't seem like a problem we can generally solve here."), taxonomy: Optional[Path] = typer.Option(None, help='The path to a directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional.'), add_name: bool = typer.Option(False, '--add-name', help='Add the taxon name to the output.'), add_rank: bool = typer.Option(False, '--add-rank', help='Add the taxon rank to the output.'), add_lineage: bool = typer.Option(False, '--add-lineage', help="Add the taxon's entire lineage to the output. These are taxon names separated by semi-colons."), add_id_lineage: bool = typer.Option(False, '--add-id-lineage', help="Add the taxon's entire lineage to the output. These are taxon identifiers separated by semi-colons."), add_rank_lineage: bool = typer.Option(False, '--add-rank-lineage', help="Add the taxon's entire rank lineage to the output. These are taxon ranks separated by semi-colons."), ignore_errors: bool = typer.Option(False, '--ignore-errors', help='Ignore any metagenomic profiles with errors. Please note that there must be at least two profiles without errors to merge.')) -> None

Standardise and merge two or more taxonomic profiles.

Source code in src/taxpasta/infrastructure/cli/merge.py
@app.command()
def merge(
    profiles: Optional[List[Path]] = typer.Argument(  # noqa: B008
        None,
        metavar="[PROFILE1 PROFILE2 [...]]",
        help="Two or more files containing taxonomic profiles. Required unless there is"
        " a sample sheet. Filenames will be parsed as sample names.",
        show_default=False,
    ),
    profiler: SupportedProfiler = typer.Option(  # noqa: B008
        ...,
        "--profiler",
        "-p",
        case_sensitive=False,
        help="The taxonomic profiler used. All provided profiles must come from the "
        "same tool!",
        show_default=False,
    ),
    sample_sheet: Optional[Path] = typer.Option(  # noqa: B008
        None,
        "--samplesheet",
        "-s",
        help="A table with a header and two columns: the first "
        "column named 'sample' which can be any string and the second column named "
        "'profile' which "
        "must be a file path to an actual taxonomic abundance profile. If this option "
        "is provided, any arguments are ignored.",
        exists=True,
        file_okay=True,
        dir_okay=False,
        readable=True,
    ),
    samplesheet_format: Optional[TableReaderFileFormat] = typer.Option(  # noqa: B008
        None,
        case_sensitive=False,
        help="The file format of the sample sheet. Depending on the choice, additional "
        "package dependencies may apply. Will be parsed from the sample sheet "
        "file name but can be set explicitly.",
    ),
    output: Path = typer.Option(  # noqa: B008
        ...,
        "--output",
        "-o",
        help="The desired output file. By default, the file extension will be used to "
        "determine the output format, but when setting the format explicitly using "
        "the --output-format option, automatic detection is disabled.",
        show_default=False,
    ),
    output_format: Optional[
        WideObservationTableFileFormat
    ] = typer.Option(  # noqa: B008
        None,
        case_sensitive=False,
        help="The desired output format. Depending on the choice, additional package "
        "dependencies may apply. By default it will be parsed from the output file "
        "name but it can be set explicitly and will then disable the automatic "
        "detection.",
    ),
    wide_format: bool = typer.Option(  # noqa: B008
        True,
        "--wide/--long",
        help="Output merged abundance data in either wide or (tidy) long format. "
        "Ignored when the desired output format is BIOM.",
    ),
    summarise_at: Optional[str] = typer.Option(  # noqa: B008
        None,
        "--summarise-at",
        "--summarize-at",
        help="Summarise abundance profiles at higher taxonomic rank. The provided "
        "option must match a rank in the taxonomy exactly. This is akin to the clade "
        "assigned reads provided by, for example, kraken2, where the abundances of a "
        "whole taxonomic branch are assigned to a taxon at the desired rank. Please "
        "note that abundances above the selected rank are simply ignored. No attempt "
        "is made to redistribute those down to the desired rank. Some tools, like "
        "Bracken, were designed for this purpose but it doesn't seem like a problem we "
        "can generally solve here.",
    ),
    taxonomy: Optional[Path] = typer.Option(  # noqa: B008
        None,
        help="The path to a directory containing taxdump files. At least nodes.dmp and "
        "names.dmp are required. A merged.dmp file is optional.",
    ),
    add_name: bool = typer.Option(  # noqa: B008
        False,
        "--add-name",
        help="Add the taxon name to the output.",
    ),
    add_rank: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank",
        help="Add the taxon rank to the output.",
    ),
    add_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon names "
        "separated by semi-colons.",
    ),
    add_id_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-id-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon "
        "identifiers separated by semi-colons.",
    ),
    add_rank_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank-lineage",
        help="Add the taxon's entire rank lineage to the output. These are taxon "
        "ranks separated by semi-colons.",
    ),
    ignore_errors: bool = typer.Option(  # noqa: B008
        False,
        "--ignore-errors",
        help="Ignore any metagenomic profiles with errors. Please note that there "
        "must be at least two profiles without errors to merge.",
    ),
) -> None:
    """Standardise and merge two or more taxonomic profiles."""
    # Perform input validation.
    valid_output_format: Union[
        TidyObservationTableFileFormat, WideObservationTableFileFormat
    ]
    # When a BIOM output format is chosen, the result can only be a wide format BIOM.
    if output.suffix.lower() == ".biom" or (
        output_format is not None
        and output_format is WideObservationTableFileFormat.BIOM
    ):
        try:
            WideObservationTableFileFormat.check_dependencies(
                WideObservationTableFileFormat.BIOM
            )
        except RuntimeError as error:
            logger.debug("", exc_info=error)
            logger.critical(str(error))
            raise typer.Exit(code=1)
        valid_output_format = WideObservationTableFileFormat.BIOM
        wide_format = True
    else:
        if wide_format:
            valid_output_format = validate_observation_matrix_format(
                output, None if output_format is None else output_format.value
            )
        else:
            valid_output_format = validate_tidy_observation_table_format(
                output, None if output_format is None else output_format.value
            )

    taxonomy_service: Optional[TaxonomyService] = None
    if taxonomy is not None:
        from taxpasta.infrastructure.domain.service.taxopy_taxonomy_service import (
            TaxopyTaxonomyService,
        )

        taxonomy_service = TaxopyTaxonomyService.from_taxdump(taxonomy)

    try:
        command = AddTaxInfoCommand(
            taxonomy_service=taxonomy_service,
            summarise_at=summarise_at,
            add_name=add_name,
            add_rank=add_rank,
            add_lineage=add_lineage,
            add_id_lineage=add_id_lineage,
            add_rank_lineage=add_rank_lineage,
        )
    except ValueError as exc:
        logger.critical(str(exc))
        raise typer.Exit(code=2)
    # Ensure that we can write to the output directory.
    try:
        output.parent.mkdir(parents=True, exist_ok=True)
    except OSError as error:
        logger.critical("Failed to create the parent directory for the output.")
        logger.critical(str(error))
        raise typer.Exit(1)
    # Extract and transform sample data.
    if sample_sheet is not None:
        valid_sample_format = validate_sample_format(sample_sheet, samplesheet_format)
        logger.info("Read sample sheet from '%s'.", str(sample_sheet))
        sheet = read_sample_sheet(sample_sheet, valid_sample_format)
        data = [(row.sample, row.profile) for row in sheet.itertuples(index=False)]
    else:
        if not profiles:
            logger.critical(
                "Neither a sample sheet nor any profiles were provided. Please adjust "
                "the command."
            )
            raise typer.Exit(code=2)
        elif len(profiles) == 1:
            logger.critical(
                "Only a single profile was provided. Please provide at least two."
            )
            raise typer.Exit(code=2)
        # Parse sample names from file names.
        data = [(prof.stem, prof) for prof in profiles]

    handling_app = SampleHandlingApplication(
        profile_reader=ApplicationServiceRegistry.profile_reader(profiler),
        profile_standardiser=ApplicationServiceRegistry.profile_standardisation_service(
            profiler
        ),
        taxonomy_service=taxonomy_service,
    )
    samples = []
    for name, profile in data:
        try:
            samples.append(handling_app.etl_sample(name, profile))
        except StandardisationError as error:
            logger.debug("", exc_info=error)
            if ignore_errors:
                logger.error(
                    "Error in sample '%s' with profile '%s'.",
                    error.sample,
                    error.profile,
                )
                logger.error(error.message)
                continue
            else:
                logger.critical(
                    "Error in sample '%s' with profile '%s'.",
                    error.sample,
                    error.profile,
                )
                logger.critical(error.message)
                raise typer.Exit(code=1)

    if summarise_at:
        summarised = []
        for sample in samples:
            try:
                summarised.append(handling_app.summarise_sample(sample, summarise_at))
            except ValueError as error:
                logger.debug("", exc_info=error)
                if ignore_errors:
                    logger.error("Error in sample '%s'.", sample.name)
                    logger.error(str(error))
                    continue
                else:
                    logger.critical("Error in sample '%s'.", sample.name)
                    logger.critical(str(error))
                    raise typer.Exit(code=1)
        samples = summarised

    if len(samples) < 2:
        logger.critical("Less than two profiles are without errors. Nothing to merge.")
        raise typer.Exit(code=1)

    result = handling_app.merge_samples(samples, wide_format)

    if valid_output_format is not WideObservationTableFileFormat.BIOM:
        result = command.execute(result)

    logger.info("Write result to '%s'.", str(output))
    if wide_format:
        assert isinstance(  # nosec assert_used
            valid_output_format, WideObservationTableFileFormat
        )
        writer = ApplicationServiceRegistry.wide_observation_table_writer(
            valid_output_format
        )
    else:
        assert isinstance(  # nosec assert_used
            valid_output_format, TidyObservationTableFileFormat
        )
        writer = ApplicationServiceRegistry.tidy_observation_table_writer(
            valid_output_format  # type: ignore
        )
    try:
        if valid_output_format is WideObservationTableFileFormat.BIOM:
            writer.write(result, output, taxonomy=taxonomy_service)
        else:
            writer.write(result, output)
    except OSError as error:
        logger.debug("", exc_info=error)
        logger.critical("Failed to write the output result.")
        logger.critical(str(error))
        raise typer.Exit(1)
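A typical invocation, with hypothetical file names, might look like:

taxpasta merge --profiler kraken2 --output merged.tsv sample_1.kraken2 sample_2.kraken2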
read_sample_sheet(sample_sheet: Path, sample_format: TableReaderFileFormat) -> DataFrame[SampleSheet]

Extract and validate the sample sheet.

Parameters:

    sample_sheet (Path): Path to the sample sheet. [required]
    sample_format (TableReaderFileFormat): The determined file format. [required]

Returns:

    DataFrame[SampleSheet]: A pandas data frame in the form of a sample sheet.

Raises:

    Exit: Early abortion of program when there is a schema error.

Source code in src/taxpasta/infrastructure/cli/merge.py
def read_sample_sheet(
    sample_sheet: Path, sample_format: TableReaderFileFormat
) -> DataFrame[SampleSheet]:
    """
    Extract and validate the sample sheet.

    Args:
        sample_sheet: Path to the sample sheet.
        sample_format: The determined file format.

    Returns:
        A pandas data frame in the form of a sample sheet.

    Raises:
        Exit: Early abortion of program when there is a schema error.

    """
    reader = ApplicationServiceRegistry.table_reader(sample_format)
    result = reader.read(sample_sheet)
    try:
        SampleSheet.validate(result, lazy=True)
    except pandera.errors.SchemaErrors as errors:
        logger.debug("", exc_info=errors)
        logger.critical("Parsing the sample sheet '%s' failed.", str(sample_sheet))
        logger.critical(errors.failure_cases)
        raise typer.Exit(code=1)
    return result
validate_observation_matrix_format(output: Path, output_format: Optional[str]) -> WideObservationTableFileFormat

Detect the output format if it isn't given.

Parameters:

    output (Path): Path for the output. [required]
    output_format (Optional[str]): The selected file format if any. [required]

Returns:

    WideObservationTableFileFormat: The validated output file format.

Raises:

    Exit: Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/merge.py
def validate_observation_matrix_format(
    output: Path, output_format: Optional[str]
) -> WideObservationTableFileFormat:
    """
    Detect the output format if it isn't given.

    Args:
        output: Path for the output.
        output_format: The selected file format if any.

    Returns:
        The validated output file format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if output_format is None:
        try:
            result = cast(
                WideObservationTableFileFormat,
                WideObservationTableFileFormat.guess_format(output),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the output or set the '--output-format' explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = WideObservationTableFileFormat(output_format)
    try:
        WideObservationTableFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
validate_sample_format(sample_sheet: Path, sample_format: Optional[TableReaderFileFormat]) -> TableReaderFileFormat

Detect the sample sheet format if it isn't given.

Parameters:

    sample_sheet (Path): Path to the sample sheet. [required]
    sample_format (Optional[TableReaderFileFormat]): The selected file format if any. [required]

Returns:

    TableReaderFileFormat: The validated sample sheet format.

Raises:

    Exit: Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/merge.py
def validate_sample_format(
    sample_sheet: Path, sample_format: Optional[TableReaderFileFormat]
) -> TableReaderFileFormat:
    """
    Detect the sample sheet format if it isn't given.

    Args:
        sample_sheet: Path to the sample sheet.
        sample_format: The selected file format if any.

    Returns:
        The validated sample sheet format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if sample_format is None:
        try:
            result = cast(
                TableReaderFileFormat,
                TableReaderFileFormat.guess_format(sample_sheet),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the sample sheet or set the '--samplesheet-format' "
                "explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = sample_format
    try:
        TableReaderFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
validate_tidy_observation_table_format(output: Path, output_format: Optional[str]) -> TidyObservationTableFileFormat

Detect the output format if it isn't given.

Parameters:

Name           Type           Description                       Default
output         Path           Path for the output.              required
output_format  Optional[str]  The selected file format if any.  required

Returns:

Type                            Description
TidyObservationTableFileFormat  The validated output file format.

Raises:

Type  Description
Exit  Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/merge.py
def validate_tidy_observation_table_format(
    output: Path, output_format: Optional[str]
) -> TidyObservationTableFileFormat:
    """
    Detect the output format if it isn't given.

    Args:
        output: Path for the output.
        output_format: The selected file format if any.

    Returns:
        The validated output file format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if output_format is None:
        try:
            result = cast(
                TidyObservationTableFileFormat,
                TidyObservationTableFileFormat.guess_format(output),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the output or set the '--output-format' explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = TidyObservationTableFileFormat(output_format)
    try:
        TidyObservationTableFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
standardise

Add the standardize command to the taxpasta CLI.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
Functions
standardise(profile: Path = typer.Argument(..., metavar='PROFILE', help='A file containing a taxonomic profile.', show_default=False), profiler: SupportedProfiler = typer.Option(..., '--profiler', '-p', case_sensitive=False, help='The taxonomic profiler used.', show_default=False), output: Path = typer.Option(..., '--output', '-o', help='The desired output file. By default, the file extension will be used to determine the output format, but when setting the format explicitly using the --output-format option, automatic detection is disabled.', show_default=False), output_format: Optional[StandardProfileFileFormat] = typer.Option(None, case_sensitive=False, help='The desired output format. Depending on the choice, additional package dependencies may apply. By default it will be parsed from the output file name but it can be set explicitly and will then disable the automatic detection.'), summarise_at: Optional[str] = typer.Option(None, '--summarise-at', '--summarize-at', help="Summarise abundance profiles at higher taxonomic rank. The provided option must match a rank in the taxonomy exactly. This is akin to the clade assigned reads provided by, for example, kraken2, where the abundances of a whole taxonomic branch are assigned to a taxon at the desired rank. Please note that abundances above the selected rank are simply ignored. No attempt is made to redistribute those down to the desired rank. Some tools, like Bracken, were designed for this purpose but it doesn't seem like a problem we can generally solve here."), taxonomy: Optional[Path] = typer.Option(None, help='The path to a directory containing taxdump files. At least nodes.dmp and names.dmp are required. A merged.dmp file is optional.'), add_name: bool = typer.Option(False, '--add-name', help='Add the taxon name to the output.'), add_rank: bool = typer.Option(False, '--add-rank', help='Add the taxon rank to the output.'), add_lineage: bool = typer.Option(False, '--add-lineage', help="Add the taxon's entire lineage to the output. These are taxon names separated by semi-colons."), add_id_lineage: bool = typer.Option(False, '--add-id-lineage', help="Add the taxon's entire lineage to the output. These are taxon identifiers separated by semi-colons."), add_rank_lineage: bool = typer.Option(False, '--add-rank-lineage', help="Add the taxon's entire rank lineage to the output. These are taxon ranks separated by semi-colons.")) -> None

Standardise a taxonomic profile.

Source code in src/taxpasta/infrastructure/cli/standardise.py
@app.command(
    no_args_is_help=True, help="Standardise a taxonomic profile (alias: 'standardize')."
)
@app.command("standardize", hidden=True)
def standardise(
    profile: Path = typer.Argument(  # noqa: B008
        ...,
        metavar="PROFILE",
        help="A file containing a taxonomic profile.",
        show_default=False,
    ),
    profiler: SupportedProfiler = typer.Option(  # noqa: B008
        ...,
        "--profiler",
        "-p",
        case_sensitive=False,
        help="The taxonomic profiler used.",
        show_default=False,
    ),
    output: Path = typer.Option(  # noqa: B008
        ...,
        "--output",
        "-o",
        help="The desired output file. By default, the file extension will be used to "
        "determine the output format, but when setting the format explicitly using "
        "the --output-format option, automatic detection is disabled.",
        show_default=False,
    ),
    output_format: Optional[StandardProfileFileFormat] = typer.Option(  # noqa: B008
        None,
        case_sensitive=False,
        help="The desired output format. Depending on the choice, additional package "
        "dependencies may apply. By default it will be parsed from the output file "
        "name but it can be set explicitly and will then disable the automatic "
        "detection.",
    ),
    summarise_at: Optional[str] = typer.Option(  # noqa: B008
        None,
        "--summarise-at",
        "--summarize-at",
        help="Summarise abundance profiles at higher taxonomic rank. The provided "
        "option must match a rank in the taxonomy exactly. This is akin to the clade "
        "assigned reads provided by, for example, kraken2, where the abundances of a "
        "whole taxonomic branch are assigned to a taxon at the desired rank. Please "
        "note that abundances above the selected rank are simply ignored. No attempt "
        "is made to redistribute those down to the desired rank. Some tools, like "
        "Bracken, were designed for this purpose but it doesn't seem like a problem we "
        "can generally solve here.",
    ),
    taxonomy: Optional[Path] = typer.Option(  # noqa: B008
        None,
        help="The path to a directory containing taxdump files. At least nodes.dmp and "
        "names.dmp are required. A merged.dmp file is optional.",
    ),
    add_name: bool = typer.Option(  # noqa: B008
        False,
        "--add-name",
        help="Add the taxon name to the output.",
    ),
    add_rank: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank",
        help="Add the taxon rank to the output.",
    ),
    add_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon names "
        "separated by semi-colons.",
    ),
    add_id_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-id-lineage",
        help="Add the taxon's entire lineage to the output. These are taxon "
        "identifiers separated by semi-colons.",
    ),
    add_rank_lineage: bool = typer.Option(  # noqa: B008
        False,
        "--add-rank-lineage",
        help="Add the taxon's entire rank lineage to the output. These are taxon "
        "ranks separated by semi-colons.",
    ),
) -> None:
    """Standardise a taxonomic profile."""
    # Perform input validation.
    valid_output_format = validate_output_format(
        output, None if output_format is None else output_format.value
    )

    taxonomy_service: Optional[TaxonomyService] = None
    if taxonomy is not None:
        from taxpasta.infrastructure.domain.service.taxopy_taxonomy_service import (
            TaxopyTaxonomyService,
        )

        taxonomy_service = TaxopyTaxonomyService.from_taxdump(taxonomy)

    try:
        tax_info_command = AddTaxInfoCommand(
            taxonomy_service=taxonomy_service,
            summarise_at=summarise_at,
            add_name=add_name,
            add_rank=add_rank,
            add_lineage=add_lineage,
            add_id_lineage=add_id_lineage,
            add_rank_lineage=add_rank_lineage,
        )
    except ValueError as exc:
        logger.critical(str(exc))
        raise typer.Exit(code=2)

    # Ensure that we can write to the output directory.
    try:
        output.parent.mkdir(parents=True, exist_ok=True)
    except OSError as error:
        logger.critical("Failed to create the parent directory for the output.")
        logger.critical(str(error))
        raise typer.Exit(1)

    handling_app = SampleHandlingApplication(
        profile_reader=ApplicationServiceRegistry.profile_reader(profiler),
        profile_standardiser=ApplicationServiceRegistry.profile_standardisation_service(
            profiler
        ),
        taxonomy_service=taxonomy_service,
    )
    try:
        sample = handling_app.etl_sample(profile.stem, profile)
    except StandardisationError as error:
        logger.debug("", exc_info=error)
        logger.critical(
            "Error in sample '%s' with profile '%s'.", error.sample, error.profile
        )
        logger.critical(error.message)
        raise typer.Exit(code=1)

    if summarise_at:
        try:
            sample = handling_app.summarise_sample(sample, summarise_at)
        except ValueError as error:
            logger.debug("", exc_info=error)
            logger.critical("Error in sample '%s'.", sample.name)
            logger.critical(str(error))
            raise typer.Exit(code=1)

    result = tax_info_command.execute(sample.profile)

    logger.info("Write result to '%s'.", str(output))
    writer = ApplicationServiceRegistry.standard_profile_writer(valid_output_format)
    try:
        writer.write(result, output)
    except OSError as error:
        logger.critical("Failed to write the output result.")
        logger.critical(str(error))
        raise typer.Exit(1)
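
A command-line sketch of the above (the profile file name is hypothetical; kraken2 is one of the supported profilers):

taxpasta standardise -p kraken2 -o standardised.tsv sample.kraken2.report.txt

# Enrich the output with taxonomy information from a local taxdump directory.
taxpasta standardise -p kraken2 -o standardised.tsv \
    --taxonomy taxdump/ --add-name --add-rank sample.kraken2.report.txt
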
validate_output_format(output: Path, output_format: Optional[str]) -> StandardProfileFileFormat

Detect the output format if it isn't given.

Parameters:

Name           Type           Description                       Default
output         Path           Path for the output.              required
output_format  Optional[str]  The selected file format if any.  required

Returns:

Type                       Description
StandardProfileFileFormat  The validated output file format.

Raises:

Type  Description
Exit  Early abortion of program when the format cannot be guessed or dependencies are missing.

Source code in src/taxpasta/infrastructure/cli/standardise.py
def validate_output_format(
    output: Path, output_format: Optional[str]
) -> StandardProfileFileFormat:
    """
    Detect the output format if it isn't given.

    Args:
        output: Path for the output.
        output_format: The selected file format if any.

    Returns:
        The validated output file format.

    Raises:
        Exit: Early abortion of program when the format cannot be guessed or
            dependencies are missing.

    """
    if output_format is None:
        try:
            result = cast(
                StandardProfileFileFormat,
                StandardProfileFileFormat.guess_format(output),
            )
        except ValueError as error:
            logger.critical(str(error))
            logger.critical(
                "Please rename the output or set the '--output-format' explicitly."
            )
            raise typer.Exit(code=2)
    else:
        result = StandardProfileFileFormat(output_format)
    try:
        StandardProfileFileFormat.check_dependencies(result)
    except RuntimeError as error:
        logger.debug("", exc_info=error)
        logger.critical(str(error))
        raise typer.Exit(code=1)
    return result
taxpasta

Provide a command-line interface (CLI) for taxpasta functionality.

Attributes
app = typer.Typer(help='TAXonomic Profile Aggregation and STAndardisation', context_settings={'help_option_names': ['-h', '--help']}) module-attribute
logger = logging.getLogger('taxpasta') module-attribute
Classes
LogLevel

Bases: str, Enum

Define the choices for the log level option.

Source code in src/taxpasta/infrastructure/cli/taxpasta.py
@unique
class LogLevel(str, Enum):
    """Define the choices for the log level option."""

    DEBUG = "DEBUG"
    INFO = "INFO"
    WARNING = "WARNING"
    ERROR = "ERROR"
    CRITICAL = "CRITICAL"
Attributes
CRITICAL = 'CRITICAL' class-attribute instance-attribute
DEBUG = 'DEBUG' class-attribute instance-attribute
ERROR = 'ERROR' class-attribute instance-attribute
INFO = 'INFO' class-attribute instance-attribute
WARNING = 'WARNING' class-attribute instance-attribute
Functions
initialize(context: typer.Context, version: Optional[bool] = typer.Option(None, '--version', callback=version_callback, is_eager=True, help='Print only the current tool version and exit.'), log_level: LogLevel = typer.Option(LogLevel.INFO.name, '--log-level', '-l', case_sensitive=False, help='Set the desired log level.'))

Initialize logging and rich printing if available.

Source code in src/taxpasta/infrastructure/cli/taxpasta.py
@app.callback(invoke_without_command=True)
def initialize(
    context: typer.Context,
    version: Optional[bool] = typer.Option(  # noqa: B008
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Print only the current tool version and exit.",
    ),
    log_level: LogLevel = typer.Option(  # noqa: B008
        LogLevel.INFO.name,
        "--log-level",
        "-l",
        case_sensitive=False,
        help="Set the desired log level.",
    ),
):
    """Initialize logging and rich printing if available."""
    try:
        from rich.logging import RichHandler

        logging.basicConfig(
            level=log_level.name,
            format="%(message)s",
            datefmt="[%X]",
            handlers=[RichHandler(rich_tracebacks=True, tracebacks_suppress=[typer])],
        )
    except ModuleNotFoundError:
        logging.basicConfig(level=log_level.name, format="[%(levelname)s] %(message)s")
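
Since the option is defined on the callback, the log level precedes any subcommand and is matched case-insensitively (a sketch with hypothetical file names):

taxpasta --log-level DEBUG standardise -p kraken2 -o out.tsv profile.txt
taxpasta -l debug standardise -p kraken2 -o out.tsv profile.txt
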
version_callback(is_set: bool) -> None

Print the tool version if desired.

Parameters:

Name    Type  Description                                                  Default
is_set  bool  Whether the version was requested as a command line option.  required

Raises:

Type  Description
Exit  With default code 0 to signal normal program end.

Source code in src/taxpasta/infrastructure/cli/taxpasta.py
def version_callback(is_set: bool) -> None:
    """
    Print the tool version if desired.

    Args:
        is_set: Whether the version was requested as a command line option.

    Raises:
        Exit: With default code 0 to signal normal program end.

    """
    if is_set:
        print(taxpasta.__version__)
        raise typer.Exit()
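
Invoked through the eager --version option defined above, this prints only the version string and exits with code 0:

taxpasta --version
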
Modules

domain

Provide concrete implementations of domain models and services.

Modules

service

Provide concrete implementations of domain services.

Modules
taxopy_taxonomy_service

Provide a taxonomy service based on taxopy.

Attributes
logger = logging.getLogger(__name__) module-attribute
Classes
TaxopyTaxonomyService

Bases: TaxonomyService

Define the taxonomy service based on taxopy.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
class TaxopyTaxonomyService(TaxonomyService):
    """Define the taxonomy service based on taxopy."""

    def __init__(self, *, tax_db: taxopy.TaxDb, **kwargs) -> None:
        """Initialize a taxonomy service instance with a taxopy database."""
        super().__init__(**kwargs)
        self._tax_db = tax_db

    @classmethod
    def from_taxdump(cls, source: Path) -> TaxopyTaxonomyService:
        """Create a service instance from a directory path containing taxdump info."""
        merged = source / "merged.dmp"
        return cls(
            tax_db=taxopy.TaxDb(
                names_dmp=str(source / "names.dmp"),
                nodes_dmp=str(source / "nodes.dmp"),
                merged_dmp=str(merged) if merged.is_file() else None,
                keep_files=True,
            )
        )

    def get_taxon_name(self, taxonomy_id: int) -> Optional[str]:
        """Return the name of a given taxonomy identifier."""
        return self._tax_db.taxid2name.get(taxonomy_id)

    def get_taxon_rank(self, taxonomy_id: int) -> Optional[str]:
        """Return the rank of a given taxonomy identifier."""
        return self._tax_db.taxid2rank.get(taxonomy_id)

    def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
        """
        Return the lineage of a given taxonomy identifier as names.

        Only names with associated ranks are included.

        """
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return list(reversed(taxon.rank_name_dictionary.values()))

    def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]:
        """
        Return the lineage of a given taxonomy identifier as identifiers.

        Only identifiers with associated ranks are included.

        """
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return list(reversed(taxon.rank_taxid_dictionary.values()))

    def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
        """Return the lineage of a given taxonomy identifier as ranks."""
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return list(reversed(taxon.rank_name_dictionary.keys()))

    def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon name to the given table."""
        result = table.copy()
        result.insert(
            1,
            "name",
            table.taxonomy_id.map(self._tax_db.taxid2name),
        )
        return result

    def add_rank(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon rank to the given table."""
        result = table.copy()
        result.insert(
            1,
            "rank",
            table.taxonomy_id.map(self._tax_db.taxid2rank),
        )
        return result

    def add_name_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage to the given table."""
        result = table.copy()
        result.insert(
            1,
            "lineage",
            table.taxonomy_id.map(self._name_lineage_as_str),
        )
        return result

    def _name_lineage_as_str(self, taxonomy_id: int) -> Optional[str]:
        """Return the lineage of a taxon as concatenated names."""
        if lineage := self.get_taxon_name_lineage(taxonomy_id):
            return ";".join(lineage)
        else:
            return None

    def add_identifier_lineage(
        self, table: DataFrame[ResultTable]
    ) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage as identifiers to the given table."""
        result = table.copy()
        result.insert(
            1,
            "id_lineage",
            table.taxonomy_id.map(self._taxid_lineage_as_str),
        )
        return result

    def _taxid_lineage_as_str(self, taxonomy_id: int) -> Optional[str]:
        """Return the lineage of a taxon as concatenated identifiers."""
        if lineage := self.get_taxon_identifier_lineage(taxonomy_id):
            return ";".join(str(tax_id) for tax_id in lineage)
        else:
            return None

    def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage as ranks to the given table."""
        result = table.copy()
        result.insert(
            1,
            "rank_lineage",
            table.taxonomy_id.map(self._rank_lineage_as_str),
        )
        return result

    def _rank_lineage_as_str(self, taxonomy_id: int) -> Optional[str]:
        """Return the rank lineage of a taxon as concatenated identifiers."""
        if lineage := self.get_taxon_rank_lineage(taxonomy_id):
            return ";".join(lineage)
        else:
            return None

    def format_biom_taxonomy(
        self, table: DataFrame[ResultTable]
    ) -> List[Dict[str, List[str]]]:
        """Format the taxonomy as BIOM observation metadata."""
        lineages = [self._get_rank_name(tax_id) for tax_id in table.taxonomy_id]
        result = [{} if lineage is None else lineage for lineage in lineages]
        longest_lineage = max(result, key=len)
        max_ranks = list(reversed(longest_lineage.keys()))
        return [
            {"taxonomy": self._pad_lineage(lineage, max_ranks)} for lineage in result
        ]

    def _get_rank_name(self, taxonomy_id: int) -> Optional[Dict[str, str]]:
        try:
            taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
        except TaxidError:
            return None
        return taxon.rank_name_dictionary

    def _pad_lineage(self, lineage: Dict[str, str], max_ranks: List[str]) -> List[str]:
        """Pad a lineage to match the length of the longest lineage."""
        return [lineage.get(rank, "") for rank in max_ranks]

    def summarise_at(
        self, profile: DataFrame[StandardProfile], rank: str
    ) -> DataFrame[StandardProfile]:
        """Summarise a standardised abundance profile at a higher taxonomic rank."""
        branching = defaultdict(list)
        for tax_id in profile[StandardProfile.taxonomy_id]:
            # For now, we ignore the identifier zero (unclassified).
            if tax_id == 0:
                continue
            taxon = taxopy.Taxon(taxid=tax_id, taxdb=self._tax_db)
            if taxon.rank == rank:
                branching[taxon.taxid].append(taxon.taxid)
                continue
            for parent_id in taxon.taxid_lineage:
                ancestor_rank = self._tax_db.taxid2rank[parent_id]
                if ancestor_rank == rank:
                    # We do not need to summarize further than to the desired rank.
                    branching[parent_id].append(taxon.taxid)
                    break
            else:
                # We did not encounter the desired rank. Likely, the taxon is situated
                # above the desired rank in the taxonomy.
                logger.debug(
                    "The desired rank '%s' is not in the lineage of the taxon %d - %s.",
                    rank,
                    taxon.taxid,
                    taxon.name,
                )
        finalized = dict(branching)
        root_ids = sorted(finalized)
        counts = []
        for root_id in root_ids:
            leaves = finalized[root_id]
            counts.append(
                profile.loc[
                    profile[StandardProfile.taxonomy_id].isin(leaves),
                    StandardProfile.count,
                ].sum()
            )
        return pd.DataFrame(
            {
                StandardProfile.taxonomy_id: pd.Series(data=root_ids, dtype="category"),
                StandardProfile.count: counts,
            }
        )
Functions
__init__(*, tax_db: taxopy.TaxDb, **kwargs) -> None

Initialize a taxonomy service instance with a taxopy database.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def __init__(self, *, tax_db: taxopy.TaxDb, **kwargs) -> None:
    """Initialize a taxonomy service instance with a taxopy database."""
    super().__init__(**kwargs)
    self._tax_db = tax_db
add_identifier_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon lineage as identifiers to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_identifier_lineage(
    self, table: DataFrame[ResultTable]
) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage as identifiers to the given table."""
    result = table.copy()
    result.insert(
        1,
        "id_lineage",
        table.taxonomy_id.map(self._taxid_lineage_as_str),
    )
    return result
add_name(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon name to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon name to the given table."""
    result = table.copy()
    result.insert(
        1,
        "name",
        table.taxonomy_id.map(self._tax_db.taxid2name),
    )
    return result
add_name_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon lineage to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_name_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage to the given table."""
    result = table.copy()
    result.insert(
        1,
        "lineage",
        table.taxonomy_id.map(self._name_lineage_as_str),
    )
    return result
add_rank(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon rank to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_rank(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon rank to the given table."""
    result = table.copy()
    result.insert(
        1,
        "rank",
        table.taxonomy_id.map(self._tax_db.taxid2rank),
    )
    return result
add_rank_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable]

Add a column for the taxon lineage as ranks to the given table.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage as ranks to the given table."""
    result = table.copy()
    result.insert(
        1,
        "rank_lineage",
        table.taxonomy_id.map(self._rank_lineage_as_str),
    )
    return result
format_biom_taxonomy(table: DataFrame[ResultTable]) -> List[Dict[str, List[str]]]

Format the taxonomy as BIOM observation metadata.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def format_biom_taxonomy(
    self, table: DataFrame[ResultTable]
) -> List[Dict[str, List[str]]]:
    """Format the taxonomy as BIOM observation metadata."""
    lineages = [self._get_rank_name(tax_id) for tax_id in table.taxonomy_id]
    result = [{} if lineage is None else lineage for lineage in lineages]
    longest_lineage = max(result, key=len)
    max_ranks = list(reversed(longest_lineage.keys()))
    return [
        {"taxonomy": self._pad_lineage(lineage, max_ranks)} for lineage in result
    ]
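
A sketch of the resulting shape (the result table and service instance are assumed to exist): every 'taxonomy' list covers the ranks of the longest observed lineage, with empty strings at any rank a given taxon lacks.

metadata = service.format_biom_taxonomy(result_table)
# e.g. [{'taxonomy': ['Bacteria', 'Pseudomonadota', '', '']}, ...]
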
from_taxdump(source: Path) -> TaxopyTaxonomyService classmethod

Create a service instance from a directory path containing taxdump info.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
@classmethod
def from_taxdump(cls, source: Path) -> TaxopyTaxonomyService:
    """Create a service instance from a directory path containing taxdump info."""
    merged = source / "merged.dmp"
    return cls(
        tax_db=taxopy.TaxDb(
            names_dmp=str(source / "names.dmp"),
            nodes_dmp=str(source / "nodes.dmp"),
            merged_dmp=str(merged) if merged.is_file() else None,
            keep_files=True,
        )
    )
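
A construction sketch (the directory path is hypothetical; the identifier 562 denotes Escherichia coli when an NCBI taxdump is used):

from pathlib import Path

service = TaxopyTaxonomyService.from_taxdump(Path("taxdump/"))
# Lookups return None for identifiers absent from the database.
print(service.get_taxon_name(562))  # 'Escherichia coli'
print(service.get_taxon_rank(562))  # 'species'
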
get_taxon_identifier_lineage(taxonomy_id: int) -> Optional[List[int]]

Return the lineage of a given taxonomy identifier as identifiers.

Only identifiers with associated ranks are included.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]:
    """
    Return the lineage of a given taxonomy identifier as identifiers.

    Only identifiers with associated ranks are included.

    """
    try:
        taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
    except TaxidError:
        return None
    return list(reversed(taxon.rank_taxid_dictionary.values()))
get_taxon_name(taxonomy_id: int) -> Optional[str]

Return the name of a given taxonomy identifier.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_name(self, taxonomy_id: int) -> Optional[str]:
    """Return the name of a given taxonomy identifier."""
    return self._tax_db.taxid2name.get(taxonomy_id)
get_taxon_name_lineage(taxonomy_id: int) -> Optional[List[str]]

Return the lineage of a given taxonomy identifier as names.

Only names with associated ranks are included.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
    """
    Return the lineage of a given taxonomy identifier as names.

    Only names with associated ranks are included.

    """
    try:
        taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
    except TaxidError:
        return None
    return list(reversed(taxon.rank_name_dictionary.values()))
get_taxon_rank(taxonomy_id: int) -> Optional[str]

Return the rank of a given taxonomy identifier.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_rank(self, taxonomy_id: int) -> Optional[str]:
    """Return the rank of a given taxonomy identifier."""
    return self._tax_db.taxid2rank.get(taxonomy_id)
get_taxon_rank_lineage(taxonomy_id: int) -> Optional[List[str]]

Return the lineage of a given taxonomy identifier as ranks.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
    """Return the lineage of a given taxonomy identifier as ranks."""
    try:
        taxon = taxopy.Taxon(taxid=taxonomy_id, taxdb=self._tax_db)
    except TaxidError:
        return None
    return list(reversed(taxon.rank_name_dictionary.keys()))
summarise_at(profile: DataFrame[StandardProfile], rank: str) -> DataFrame[StandardProfile]

Summarise a standardised abundance profile at a higher taxonomic rank.

Source code in src/taxpasta/infrastructure/domain/service/taxopy_taxonomy_service.py
def summarise_at(
    self, profile: DataFrame[StandardProfile], rank: str
) -> DataFrame[StandardProfile]:
    """Summarise a standardised abundance profile at a higher taxonomic rank."""
    branching = defaultdict(list)
    for tax_id in profile[StandardProfile.taxonomy_id]:
        # For now, we ignore the identifier zero (unclassified).
        if tax_id == 0:
            continue
        taxon = taxopy.Taxon(taxid=tax_id, taxdb=self._tax_db)
        if taxon.rank == rank:
            branching[taxon.taxid].append(taxon.taxid)
            continue
        for parent_id in taxon.taxid_lineage:
            ancestor_rank = self._tax_db.taxid2rank[parent_id]
            if ancestor_rank == rank:
                # We do not need to summarize further than to the desired rank.
                branching[parent_id].append(taxon.taxid)
                break
        else:
            # We did not encounter the desired rank. Likely, the taxon is situated
            # above the desired rank in the taxonomy.
            logger.debug(
                "The desired rank '%s' is not in the lineage of the taxon %d - %s.",
                rank,
                taxon.taxid,
                taxon.name,
            )
    finalized = dict(branching)
    root_ids = sorted(finalized)
    counts = []
    for root_id in root_ids:
        leaves = finalized[root_id]
        counts.append(
            profile.loc[
                profile[StandardProfile.taxonomy_id].isin(leaves),
                StandardProfile.count,
            ].sum()
        )
    return pd.DataFrame(
        {
            StandardProfile.taxonomy_id: pd.Series(data=root_ids, dtype="category"),
            StandardProfile.count: counts,
        }
    )
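
A sketch of the roll-up behaviour, reusing the service instance from the construction sketch above and assuming an NCBI taxdump in which 561 is the genus Escherichia and 562 a species within it:

import pandas as pd

profile = pd.DataFrame(
    {
        "taxonomy_id": [561, 562, 0],  # genus, species, unclassified
        "count": [10, 90, 5],
    }
)
summary = service.summarise_at(profile, "genus")
# Identifier 0 is skipped, 562 is assigned to its genus ancestor 561, and
# 561 maps to itself; the result is a single genus row with a count of 100.
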

helpers

Provide general helpers.

Classes

Functions

Modules

base_data_frame_model

Provide a base data frame model for general checks and configuration.

Classes
BaseDataFrameModel

Bases: DataFrameModel

Define the base data frame model for general checks and configuration.

Source code in src/taxpasta/infrastructure/helpers/base_data_frame_model.py
class BaseDataFrameModel(pa.DataFrameModel):
    """Define the base data frame model for general checks and configuration."""

    @pa.dataframe_check
    def check_not_empty(cls, profile: pd.DataFrame) -> bool:
        """Check that the read in profile is *not* empty."""
        return not profile.empty

    class Config:
        """Configure the schema model."""

        coerce = False
        ordered = True
        strict = True
Classes
Config

Configure the schema model.

Source code in src/taxpasta/infrastructure/helpers/base_data_frame_model.py
class Config:
    """Configure the schema model."""

    coerce = False
    ordered = True
    strict = True
Attributes
coerce = False class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute
Functions
check_not_empty(profile: pd.DataFrame) -> bool

Check that the read in profile is not empty.

Source code in src/taxpasta/infrastructure/helpers/base_data_frame_model.py
@pa.dataframe_check
def check_not_empty(cls, profile: pd.DataFrame) -> bool:
    """Check that the read in profile is *not* empty."""
    return not profile.empty
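
A minimal subclass sketch (the schema name and columns are hypothetical) showing how concrete schemas inherit both the non-empty check and the strict, ordered configuration:

import pandera as pa
from pandera.typing import Series

from taxpasta.infrastructure.helpers.base_data_frame_model import BaseDataFrameModel


class ExampleProfile(BaseDataFrameModel):
    """Define a hypothetical two-column profile schema."""

    taxonomy_id: Series[int] = pa.Field(ge=0)
    count: Series[int] = pa.Field(ge=0)

Validating an empty data frame against such a schema raises a pandera SchemaError via the inherited check_not_empty check.
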
decorators

Provide general decorators.

Functions
raise_parser_warnings(func: Callable) -> Callable

Decorate a function in order to raise parser warnings as value errors.

Source code in src/taxpasta/infrastructure/helpers/decorators.py
def raise_parser_warnings(func: Callable) -> Callable:
    """Decorate a function in order to raise parser warnings as value errors."""

    @wraps(func)
    def wrapped(*args, **kwargs) -> Any:
        with warnings.catch_warnings():
            warnings.filterwarnings(action="error", category=ParserWarning)
            try:
                result = func(*args, **kwargs)
            except ParserWarning as exc:
                raise ValueError(
                    "There were unexpected issues with the data. Please double-check "
                    "the specific combination of your chosen metagenomic profiler and "
                    "input profile."
                ) from exc
        return result

    return wrapped
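
A usage sketch (the reader function is hypothetical; the import path follows the source location above):

import pandas as pd

from taxpasta.infrastructure.helpers.decorators import raise_parser_warnings


@raise_parser_warnings
def read_profile(path: str) -> pd.DataFrame:
    # Any pandas ParserWarning emitted here is re-raised as a ValueError
    # with a message pointing at a profiler/profile mismatch.
    return pd.read_csv(path, sep="\t", header=None)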