Skip to content






Provide a description of a sample.


Bases: NamedTuple

Define a sample.

Source code in src/taxpasta/domain/model/
class Sample(NamedTuple):
    """Pair a sample's name with its taxonomic profile.

    As a ``NamedTuple``, an instance is immutable and unpacks positionally as
    ``(name, profile)``.
    """

    # Name of the sample.
    name: str
    # Table annotated as conforming to the ``StandardProfile`` schema.
    profile: DataFrame[StandardProfile]
name: str instance-attribute
profile: DataFrame[StandardProfile] instance-attribute

Provide a description of the standard profile format.


Bases: DataFrameModel

Define the standard profile format.

Source code in src/taxpasta/domain/model/
class StandardProfile(pa.DataFrameModel):
    """Describe the schema of the standard profile format.

    The model declares two columns, in this order: a categorical
    ``taxonomy_id`` and an integer ``count`` that must be zero or greater.
    """

    # Taxonomy identifier, held as a categorical column.
    taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()
    # Abundance count; the ``ge=0`` check rejects negative values.
    count: Series[int] = pa.Field(ge=0)

    class Config:
        """Set the schema-wide validation options."""

        # Reject columns that are not declared on the model.
        strict = True
        # Require the columns to appear in the declared order.
        ordered = True
        # Cast columns to their declared dtypes during validation.
        coerce = True
count: Series[int] = pa.Field(ge=0) class-attribute instance-attribute
taxonomy_id: Series[pd.CategoricalDtype] = pa.Field() class-attribute instance-attribute

Configure the schema model.

Source code in src/taxpasta/domain/model/
class Config:
    """Set the schema-wide validation options."""

    # Reject columns that are not declared on the model.
    strict = True
    # Require the columns to appear in the declared order.
    ordered = True
    # Cast columns to their declared dtypes during validation.
    coerce = True
coerce = True class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute

Provide a description of a tidy observation table.


Bases: DataFrameModel

Define the tidy observation table.

Source code in src/taxpasta/domain/model/
class TidyObservationTable(pa.DataFrameModel):
    """Describe the schema of the tidy (long-format) observation table.

    ``taxonomy_id``, ``count`` and ``sample`` are always declared; the five
    taxonomy annotation columns between them are annotated as ``Optional``.
    """

    # Taxonomy identifier, held as a categorical column.
    taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()

    # Optional taxonomy annotations, all categorical.
    name: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    rank: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()

    # Abundance count as 64-bit integer; ``ge=0`` rejects negative values.
    count: Series[np.int64] = pa.Field(ge=0)
    # Sample that the observation belongs to, held as a categorical column.
    sample: Series[pd.CategoricalDtype] = pa.Field()

    class Config:
        """Set the schema-wide validation options."""

        # Reject columns that are not declared on the model.
        strict = True
        # Require the columns to appear in the declared order.
        ordered = True
        # Cast columns to their declared dtypes during validation.
        coerce = True
count: Series[np.int64] = pa.Field(ge=0) class-attribute instance-attribute
id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
name: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
rank: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
sample: Series[pd.CategoricalDtype] = pa.Field() class-attribute instance-attribute
taxonomy_id: Series[pd.CategoricalDtype] = pa.Field() class-attribute instance-attribute

Configure the schema model.

Source code in src/taxpasta/domain/model/
class Config:
    """Set the schema-wide validation options."""

    # Reject columns that are not declared on the model.
    strict = True
    # Require the columns to appear in the declared order.
    ordered = True
    # Cast columns to their declared dtypes during validation.
    coerce = True
coerce = True class-attribute instance-attribute
ordered = True class-attribute instance-attribute
strict = True class-attribute instance-attribute

Provide a description of an observation matrix.


Bases: DataFrameModel

Define the observation matrix.

Source code in src/taxpasta/domain/model/
class WideObservationTable(pa.DataFrameModel):
    """Define the observation matrix.

    ``taxonomy_id`` is always declared and the five taxonomy annotation
    columns are annotated as ``Optional``. Every other column is matched by
    the ``any_samples`` regular expression and must hold non-negative 64-bit
    integer counts.
    """

    taxonomy_id: Series[pd.CategoricalDtype] = pa.Field()
    name: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    rank: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field()
    # This field uses a regex to match all columns that are not one of the above.
    # The negative lookahead is anchored with `$` so that only exact matches of
    # the fixed column names are excluded.
    any_samples: Series[np.int64] = pa.Field(
        ge=0,
        alias="^(?!(taxonomy_id|name|rank|lineage|id_lineage|rank_lineage)$).*",
        regex=True,
    )

    class Config:
        """Configure the schema model."""

        # Cast columns to their declared dtypes during validation.
        coerce = True
        # Require the columns to appear in the declared order.
        ordered = True
any_samples: Series[np.int64] = pa.Field(ge=0, alias='^(?!(taxonomy_id|name|rank|lineage|id_lineage|rank_lineage)$).*', regex=True) class-attribute instance-attribute
id_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
name: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
rank: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
rank_lineage: Optional[Series[pd.CategoricalDtype]] = pa.Field() class-attribute instance-attribute
taxonomy_id: Series[pd.CategoricalDtype] = pa.Field() class-attribute instance-attribute

Configure the schema model.

Source code in src/taxpasta/domain/model/
class Config:
    """Set the schema-wide validation options."""

    # Require the columns to appear in the declared order.
    ordered = True
    # Cast columns to their declared dtypes during validation.
    coerce = True
coerce = True class-attribute instance-attribute
ordered = True class-attribute instance-attribute






Provide a builder service for creating a consensus between many profiles.


Bases: ABC

Define a builder service for creating a consensus between many profiles.

Source code in src/taxpasta/domain/service/
class ConsensusBuilder(ABC):
    """Define a builder service for creating a consensus between many profiles.

    This is an interface: both methods are abstract, so the class cannot be
    instantiated directly and concrete builders must override them.
    """

    @abstractmethod
    def add_sample(self, sample: Sample) -> None:
        """Add a sample to the consensus builder.

        Args:
            sample: The sample to add.

        """

    @abstractmethod
    def build(self) -> pd.DataFrame:
        """Build the consensus between all added profiles.

        Returns:
            The consensus as a pandas data frame.

        """
add_sample(sample: Sample) -> None abstractmethod

Add a sample to the consensus builder.

Source code in src/taxpasta/domain/service/
@abstractmethod
def add_sample(self, sample: Sample) -> None:
    """Add a sample to the consensus builder.

    Abstract: concrete consensus builders must override this method.

    Args:
        sample: The sample to add.

    """
build() -> pd.DataFrame abstractmethod

Build the consensus between all added profiles.

Source code in src/taxpasta/domain/service/
@abstractmethod
def build(self) -> pd.DataFrame:
    """Build the consensus between all added profiles.

    Abstract: concrete consensus builders must override this method.

    Returns:
        The consensus as a pandas data frame.

    """

Provide a sample merging service that summarizes two or more samples.


Define a sample merging service that summarizes two or more samples.

Source code in src/taxpasta/domain/service/
class SampleMergingService:
    """Define a sample merging service that summarizes two or more samples."""

    # NOTE(review): the published listing of this class lost several lines
    # (decorators, docstring delimiters and parts of the expressions below).
    # The missing pieces were reconstructed from the surviving lines, the
    # comments, and the schema models -- confirm against the repository.

    @classmethod
    @pa.check_types(lazy=True)
    def merge_wide(cls, samples: Iterable[Sample]) -> DataFrame[WideObservationTable]:
        """
        Merge two or more sample profiles into a wide-format observation matrix.

        Args:
            samples: Two or more samples.

        Returns:
            A single table containing one row per taxon, one column for the taxonomy
            identifier, and one column per sample with abundance counts.

        """
        # `set_index` creates a copy of the original profile which is convenient so that
        # we do not modify existing profiles but, of course, doubles the memory used.
        counts = [
            sample.profile.set_index(
                keys=StandardProfile.taxonomy_id, verify_integrity=True
            ).rename(columns={StandardProfile.count: sample.name})
            for sample in samples
        ]
        # Please note that `set_index` restores the underlying dtype of the categorical
        # column `taxonomy_id`. Thus, when we `reset_index` the column is of dtype
        # object but, due to schema coercion, this is automatically converted into a
        # categorical dtype again when pandera checks the return type.
        return (
            counts[0]
            .join(counts[1:], how="outer")
            # The outer join leaves missing values for taxa that are absent from
            # a sample; those are counted as zero.
            .fillna(0)
            # We explicitly convert to int64 because of a Windows type problem.
            .astype(np.int64)
            .reset_index()
        )

    @classmethod
    @pa.check_types(lazy=True)
    def merge_long(cls, samples: Iterable[Sample]) -> DataFrame[TidyObservationTable]:
        """
        Merge two or more sample profiles into a tidy observation table.

        Args:
            samples: Two or more samples.

        Returns:
            A single table containing three columns: taxonomy identifier, abundance
            count, and sample identifier.

        """
        # `assign` creates a copy of the original profile which is convenient so that
        # we do not modify existing profiles but, of course, doubles the memory used.
        # Please note that `concat` restores the underlying dtype of the categorical
        # column `taxonomy_id`. Thus, the column is of dtype
        # object but, due to schema coercion, this is automatically converted into a
        # categorical dtype again when pandera checks the return type. The same holds
        # for the `sample` column.
        result = pd.concat(
            [sample.profile.assign(sample=sample.name) for sample in samples],
            ignore_index=True,
        )
        # We explicitly convert to int64 because of a Windows type problem.
        result[TidyObservationTable.count] = result[TidyObservationTable.count].astype(
            np.int64
        )
        return result
merge_long(samples: Iterable[Sample]) -> DataFrame[TidyObservationTable] classmethod

Merge two or more sample profiles into a tidy observation table.


Name Type Description Default
samples Iterable[Sample]

Two or more samples.



Type Description
DataFrame[TidyObservationTable]

A single table containing three columns: taxonomy identifier, abundance count, and sample identifier.

Source code in src/taxpasta/domain/service/
# NOTE(review): the published listing of this method lost several lines
# (decorators, docstring delimiters and parts of the `concat`/`astype` calls).
# They were reconstructed from the surviving lines, the comments, and the
# `TidyObservationTable` schema -- confirm against the repository.
@classmethod
@pa.check_types(lazy=True)
def merge_long(cls, samples: Iterable[Sample]) -> DataFrame[TidyObservationTable]:
    """
    Merge two or more sample profiles into a tidy observation table.

    Args:
        samples: Two or more samples.

    Returns:
        A single table containing three columns: taxonomy identifier, abundance
        count, and sample identifier.

    """
    # `assign` creates a copy of the original profile which is convenient so that
    # we do not modify existing profiles but, of course, doubles the memory used.
    # Please note that `concat` restores the underlying dtype of the categorical
    # column `taxonomy_id`. Thus, the column is of dtype
    # object but, due to schema coercion, this is automatically converted into a
    # categorical dtype again when pandera checks the return type. The same holds
    # for the `sample` column.
    result = pd.concat(
        [sample.profile.assign(sample=sample.name) for sample in samples],
        ignore_index=True,
    )
    # We explicitly convert to int64 because of a Windows type problem.
    result[TidyObservationTable.count] = result[TidyObservationTable.count].astype(
        np.int64
    )
    return result
merge_wide(samples: Iterable[Sample]) -> DataFrame[WideObservationTable] classmethod

Merge two or more sample profiles into a wide-format observation matrix.


Name Type Description Default
samples Iterable[Sample]

Two or more samples.



Type Description
DataFrame[WideObservationTable]

A single table containing one row per taxon, one column for the taxonomy identifier, and one column per sample with abundance counts.

Source code in src/taxpasta/domain/service/
# NOTE(review): the published listing of this method lost several lines
# (decorators, docstring delimiters and parts of the list comprehension and
# the returned method chain). They were reconstructed from the surviving lines,
# the comments, and the schema models -- confirm against the repository.
@classmethod
@pa.check_types(lazy=True)
def merge_wide(cls, samples: Iterable[Sample]) -> DataFrame[WideObservationTable]:
    """
    Merge two or more sample profiles into a wide-format observation matrix.

    Args:
        samples: Two or more samples.

    Returns:
        A single table containing one row per taxon, one column for the taxonomy
        identifier, and one column per sample with abundance counts.

    """
    # `set_index` creates a copy of the original profile which is convenient so that
    # we do not modify existing profiles but, of course, doubles the memory used.
    counts = [
        sample.profile.set_index(
            keys=StandardProfile.taxonomy_id, verify_integrity=True
        ).rename(columns={StandardProfile.count: sample.name})
        for sample in samples
    ]
    # Please note that `set_index` restores the underlying dtype of the categorical
    # column `taxonomy_id`. Thus, when we `reset_index` the column is of dtype
    # object but, due to schema coercion, this is automatically converted into a
    # categorical dtype again when pandera checks the return type.
    return (
        counts[0]
        .join(counts[1:], how="outer")
        # The outer join leaves missing values for taxa that are absent from
        # a sample; those are counted as zero.
        .fillna(0)
        # We explicitly convert to int64 because of a Windows type problem.
        .astype(np.int64)
        .reset_index()
    )

Provide an abstract taxonomy service interface.

ResultTable = TypeVar('ResultTable', TidyObservationTable, WideObservationTable, StandardProfile) module-attribute

Bases: ABC

Define the abstract taxonomy service interface.

Source code in src/taxpasta/domain/service/
class TaxonomyService(ABC):
    """Define the abstract taxonomy service interface.

    Every method except ``__init__`` is abstract, so the class cannot be
    instantiated directly; concrete taxonomy services must override all of
    them. The ``add_*`` methods are annotated with the ``ResultTable`` type
    variable, i.e., they return the same kind of table that they receive.
    """

    def __init__(self, **kwargs) -> None:
        """Initialize a taxonomy service instance."""

    @abstractmethod
    def get_taxon_name(self, taxonomy_id: int) -> Optional[str]:
        """Return the name of a given taxonomy identifier."""

    @abstractmethod
    def get_taxon_rank(self, taxonomy_id: int) -> Optional[str]:
        """Return the rank of a given taxonomy identifier."""

    @abstractmethod
    def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
        """Return the lineage of a given taxonomy identifier as names."""

    @abstractmethod
    def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]:
        """Return the lineage of a given taxonomy identifier as identifiers."""

    @abstractmethod
    def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
        """Return the lineage of a given taxonomy identifier as ranks."""

    @abstractmethod
    def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon name to the given table."""

    @abstractmethod
    def add_rank(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon rank to the given table."""

    @abstractmethod
    def add_name_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage to the given table."""

    @abstractmethod
    def add_identifier_lineage(
        self, table: DataFrame[ResultTable]
    ) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage as identifiers to the given table."""

    @abstractmethod
    def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
        """Add a column for the taxon lineage as ranks to the given table."""

    @abstractmethod
    def format_biom_taxonomy(
        self, table: DataFrame[ResultTable]
    ) -> List[Dict[str, List[str]]]:
        """Format the taxonomy as BIOM observation metadata."""

    @abstractmethod
    def summarise_at(
        self, profile: DataFrame[StandardProfile], rank: str
    ) -> DataFrame[StandardProfile]:
        """Summarise a standardised abundance profile at a higher taxonomic rank."""
__init__(**kwargs) -> None

Initialize a taxonomy service instance.

Source code in src/taxpasta/domain/service/
def __init__(self, **kwargs) -> None:
    """Set up a taxonomy service instance.

    Args:
        **kwargs: Arbitrary keyword arguments; none are used by the body
            shown here.

    """
add_identifier_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable] abstractmethod

Add a column for the taxon lineage as identifiers to the given table.

Source code in src/taxpasta/domain/service/
@abstractmethod
def add_identifier_lineage(
    self, table: DataFrame[ResultTable]
) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage as identifiers to the given table.

    Abstract: concrete taxonomy services must override this method.

    Args:
        table: The result table to annotate.

    Returns:
        A table of the same kind as the input, per the shared ``ResultTable``
        type variable.

    """
add_name(table: DataFrame[ResultTable]) -> DataFrame[ResultTable] abstractmethod

Add a column for the taxon name to the given table.

Source code in src/taxpasta/domain/service/
@abstractmethod
def add_name(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon name to the given table.

    Abstract: concrete taxonomy services must override this method.

    Args:
        table: The result table to annotate.

    Returns:
        A table of the same kind as the input, per the shared ``ResultTable``
        type variable.

    """
add_name_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable] abstractmethod

Add a column for the taxon lineage to the given table.

Source code in src/taxpasta/domain/service/
@abstractmethod
def add_name_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage to the given table.

    Abstract: concrete taxonomy services must override this method.

    Args:
        table: The result table to annotate.

    Returns:
        A table of the same kind as the input, per the shared ``ResultTable``
        type variable.

    """
add_rank(table: DataFrame[ResultTable]) -> DataFrame[ResultTable] abstractmethod

Add a column for the taxon rank to the given table.

Source code in src/taxpasta/domain/service/
@abstractmethod
def add_rank(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon rank to the given table.

    Abstract: concrete taxonomy services must override this method.

    Args:
        table: The result table to annotate.

    Returns:
        A table of the same kind as the input, per the shared ``ResultTable``
        type variable.

    """
add_rank_lineage(table: DataFrame[ResultTable]) -> DataFrame[ResultTable] abstractmethod

Add a column for the taxon lineage as ranks to the given table.

Source code in src/taxpasta/domain/service/
@abstractmethod
def add_rank_lineage(self, table: DataFrame[ResultTable]) -> DataFrame[ResultTable]:
    """Add a column for the taxon lineage as ranks to the given table.

    Abstract: concrete taxonomy services must override this method.

    Args:
        table: The result table to annotate.

    Returns:
        A table of the same kind as the input, per the shared ``ResultTable``
        type variable.

    """
format_biom_taxonomy(table: DataFrame[ResultTable]) -> List[Dict[str, List[str]]] abstractmethod

Format the taxonomy as BIOM observation metadata.

Source code in src/taxpasta/domain/service/
@abstractmethod
def format_biom_taxonomy(
    self, table: DataFrame[ResultTable]
) -> List[Dict[str, List[str]]]:
    """Format the taxonomy as BIOM observation metadata.

    Abstract: concrete taxonomy services must override this method.

    Args:
        table: The result table whose taxonomy should be formatted.

    Returns:
        A list of dictionaries that map string keys to lists of strings.

    """
get_taxon_identifier_lineage(taxonomy_id: int) -> Optional[List[int]] abstractmethod

Return the lineage of a given taxonomy identifier as identifiers.

Source code in src/taxpasta/domain/service/
@abstractmethod
def get_taxon_identifier_lineage(self, taxonomy_id: int) -> Optional[List[int]]:
    """Return the lineage of a given taxonomy identifier as identifiers.

    Abstract: concrete taxonomy services must override this method.

    Args:
        taxonomy_id: The taxonomy identifier to look up.

    Returns:
        The lineage as a list of integer identifiers; the return annotation
        also permits ``None``.

    """
get_taxon_name(taxonomy_id: int) -> Optional[str] abstractmethod

Return the name of a given taxonomy identifier.

Source code in src/taxpasta/domain/service/
@abstractmethod
def get_taxon_name(self, taxonomy_id: int) -> Optional[str]:
    """Return the name of a given taxonomy identifier.

    Abstract: concrete taxonomy services must override this method.

    Args:
        taxonomy_id: The taxonomy identifier to look up.

    Returns:
        The taxon name; the return annotation also permits ``None``.

    """
get_taxon_name_lineage(taxonomy_id: int) -> Optional[List[str]] abstractmethod

Return the lineage of a given taxonomy identifier as names.

Source code in src/taxpasta/domain/service/
@abstractmethod
def get_taxon_name_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
    """Return the lineage of a given taxonomy identifier as names.

    Abstract: concrete taxonomy services must override this method.

    Args:
        taxonomy_id: The taxonomy identifier to look up.

    Returns:
        The lineage as a list of names; the return annotation also permits
        ``None``.

    """
get_taxon_rank(taxonomy_id: int) -> Optional[str] abstractmethod

Return the rank of a given taxonomy identifier.

Source code in src/taxpasta/domain/service/
@abstractmethod
def get_taxon_rank(self, taxonomy_id: int) -> Optional[str]:
    """Return the rank of a given taxonomy identifier.

    Abstract: concrete taxonomy services must override this method.

    Args:
        taxonomy_id: The taxonomy identifier to look up.

    Returns:
        The taxon rank; the return annotation also permits ``None``.

    """
get_taxon_rank_lineage(taxonomy_id: int) -> Optional[List[str]] abstractmethod

Return the lineage of a given taxonomy identifier as ranks.

Source code in src/taxpasta/domain/service/
@abstractmethod
def get_taxon_rank_lineage(self, taxonomy_id: int) -> Optional[List[str]]:
    """Return the lineage of a given taxonomy identifier as ranks.

    Abstract: concrete taxonomy services must override this method.

    Args:
        taxonomy_id: The taxonomy identifier to look up.

    Returns:
        The lineage as a list of ranks; the return annotation also permits
        ``None``.

    """
summarise_at(profile: DataFrame[StandardProfile], rank: str) -> DataFrame[StandardProfile] abstractmethod

Summarise a standardised abundance profile at a higher taxonomic rank.

Source code in src/taxpasta/domain/service/
@abstractmethod
def summarise_at(
    self, profile: DataFrame[StandardProfile], rank: str
) -> DataFrame[StandardProfile]:
    """Summarise a standardised abundance profile at a higher taxonomic rank.

    Abstract: concrete taxonomy services must override this method.

    Args:
        profile: The standardised abundance profile to summarise.
        rank: The taxonomic rank to summarise at.

    Returns:
        A table annotated as conforming to the standard profile schema.

    """