Skip to content

βš™οΈ API reference

API structure

There are two ways of using the polars-bio API:

  • directly on a Polars LazyFrame under a registered pb namespace

Example

 >>> type(df)
 <class 'polars.lazyframe.frame.LazyFrame'>
   import polars_bio as pb
   df.pb.sort().limit(5).collect()

  • using polars_bio module

Example

   import polars_bio as pb
   df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")

Tip

  1. Not all operations are available in both ways.
  2. You can of course use both ways in the same script.

LazyFrame

Source code in polars_bio/polars_ext.py
@pl.api.register_lazyframe_namespace("pb")
class PolarsRangesOperations:
    """
    Genomic interval operations available on a Polars LazyFrame as `df.pb`.

    The class is registered with `pl.api.register_lazyframe_namespace("pb")`;
    every method operates on the LazyFrame stored in `self._ldf`.
    """

    def __init__(self, ldf: pl.LazyFrame) -> None:
        # The LazyFrame all namespace methods operate on.
        self._ldf = ldf

    def overlap(
        self,
        other_df: pl.LazyFrame,
        suffixes: tuple[str, str] = ("_1", "_2"),
        how="inner",
        overlap_filter=FilterOp.Strict,
        cols1=["chrom", "start", "end"],
        cols2=["chrom", "start", "end"],
    ) -> pl.LazyFrame:
        """
        Overlap this LazyFrame (first set) with `other_df` (second set).

        All arguments are forwarded unchanged to `pb.overlap`.

        !!! note
            Alias for [overlap](api.md#polars_bio.overlap)
        """
        return pb.overlap(
            self._ldf,
            other_df,
            how=how,
            overlap_filter=overlap_filter,
            suffixes=suffixes,
            cols1=cols1,
            cols2=cols2,
        )

    def nearest(
        self,
        other_df: pl.LazyFrame,
        suffixes: tuple[str, str] = ("_1", "_2"),
        overlap_filter=FilterOp.Strict,
        cols1=["chrom", "start", "end"],
        cols2=["chrom", "start", "end"],
    ) -> pl.LazyFrame:
        """
        Run the nearest operation with this LazyFrame as the first set and
        `other_df` as the second set.

        All arguments are forwarded unchanged to `pb.nearest`.

        !!! note
            Alias for [nearest](api.md#polars_bio.nearest)
        """
        return pb.nearest(
            self._ldf,
            other_df,
            overlap_filter=overlap_filter,
            suffixes=suffixes,
            cols1=cols1,
            cols2=cols2,
        )

    def sort(
        self, cols: Union[list[str], None] = ["chrom", "start", "end"]
    ) -> pl.LazyFrame:
        """
        Sort a bedframe.
        !!! note
            Adapted to Polars API from [bioframe.sort_bedframe](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L1698)

        Parameters:
            cols: The names of columns containing the chromosome, start and end of the genomic intervals.
                `None` falls back to 'chrom', 'start', 'end' (same convention as `expand`).

        Returns:
            The LazyFrame sorted by `cols`.

        !!! Example
              ```python
              import polars_bio as pb
              df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
              df.pb.sort().limit(5).collect()
              ```
              ```plaintext
              <class 'builtins.PyExpr'>
              shape: (5, 9)
              ┌───────┬─────────┬─────────┬──────┬───┬────────┬────────────┬──────────┬──────────┐
              │ chrom ┆ start   ┆ end     ┆ name ┆ … ┆ strand ┆ thickStart ┆ thickEnd ┆ itemRgb  │
              │ ---   ┆ ---     ┆ ---     ┆ ---  ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---      │
              │ str   ┆ i64     ┆ i64     ┆ str  ┆   ┆ str    ┆ str        ┆ str      ┆ str      │
              ╞═══════╪═════════╪═════════╪══════╪═══╪════════╪════════════╪══════════╪══════════╡
              │ chr1  ┆ 193500  ┆ 194500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
              │ chr1  ┆ 618500  ┆ 619500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
              │ chr1  ┆ 974500  ┆ 975500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
              │ chr1  ┆ 1301500 ┆ 1302500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
              │ chr1  ┆ 1479500 ┆ 1480500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
              └───────┴─────────┴─────────┴──────┴───┴────────┴────────────┴──────────┴──────────┘

              ```

        """
        # The annotation admits None; resolve it to the default interval
        # columns instead of handing `by=None` to LazyFrame.sort.
        sort_cols = ["chrom", "start", "end"] if cols is None else cols
        return self._ldf.sort(by=sort_cols)

    def expand(
        self,
        pad: Union[int, None] = None,
        scale: Union[float, None] = None,
        side: str = "both",
        cols: Union[list[str], None] = ["chrom", "start", "end"],
    ) -> pl.LazyFrame:
        """
        Expand each interval by an amount specified with `pad`.
        !!! Note
            Adapted to Polars API from [bioframe.expand](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L150)

        Negative values for pad shrink the interval, up to the midpoint.
        Multiplicative rescaling of intervals enabled with scale. Only one of pad
        or scale can be provided.

        Parameters:
            pad :
                The amount by which the intervals are additively expanded *on each side*.
                Negative values for pad shrink intervals, but not beyond the interval
                midpoint. Either `pad` or `scale` must be supplied.

            scale :
                The factor by which to scale intervals multiplicatively on each side, e.g
                ``scale=2`` doubles each interval, ``scale=0`` returns midpoints, and
                ``scale=1`` returns original intervals. Default None.
                Either `pad` or `scale` must be supplied.

            side :
                Which side to expand, possible values are 'left', 'right' and 'both'.
                Default 'both'.

            cols :
                The names of columns containing the chromosome, start and end of the
                genomic intervals. Default values are 'chrom', 'start', 'end'.

        Returns:
            A LazyFrame in which the start and/or end columns have been replaced
            by the expanded coordinates.

        Raises:
            ValueError: If both or neither of `pad` and `scale` are supplied, if
                `scale` is negative, if `pad` is not an integer, or if `side` is
                not one of 'both', 'left', 'right'.

        """
        df = self._ldf
        # Only the start and end columns are modified; the chromosome name is
        # unpacked solely to keep the "exactly three names" requirement.
        _, sk, ek = ["chrom", "start", "end"] if cols is None else cols
        # Temporary helper columns. The names are deliberately unlikely to
        # collide with user data: a temp column overwrites any existing column
        # of the same name and is dropped again before returning.
        padsk = "_pb_expand_pad"
        midsk = "_pb_expand_mid"

        if scale is not None and pad is not None:
            raise ValueError("only one of pad or scale can be supplied")
        elif scale is not None:
            if scale < 0:
                raise ValueError("multiplicative scale must be >=0")
            # Per-side padding that turns an interval of length L into scale * L.
            pad_expr = 0.5 * (scale - 1) * (pl.col(ek) - pl.col(sk))
        elif pad is not None:
            if not isinstance(pad, int):
                raise ValueError("additive pad must be integer")
            pad_expr = pl.lit(pad)
        else:
            raise ValueError("either pad or scale must be supplied")
        # An unknown side used to be ignored silently, returning the intervals
        # without any expansion applied.
        if side not in ("both", "left", "right"):
            raise ValueError("side must be one of 'both', 'left' or 'right'")

        df = df.with_columns([pad_expr.alias(padsk)])
        if side in ("both", "left"):
            df = df.with_columns([(pl.col(sk) - pl.col(padsk)).alias(sk)])
        if side in ("both", "right"):
            df = df.with_columns([(pl.col(ek) + pl.col(padsk)).alias(ek)])

        # A negative pad shrinks intervals; clamp both ends at the midpoint so
        # that start never ends up beyond end.
        shrinking = pad is not None and pad < 0
        if shrinking:
            df = df.with_columns(
                [(pl.col(sk) + 0.5 * (pl.col(ek) - pl.col(sk))).alias(midsk)]
            )
            df = df.with_columns(
                [
                    pl.min_horizontal(pl.col(sk), pl.col(midsk))
                    .cast(pl.Int64)
                    .alias(sk),
                    pl.max_horizontal(pl.col(ek), pl.col(midsk))
                    .cast(pl.Int64)
                    .alias(ek),
                ]
            )
        if scale is not None:
            # Scaling produces fractional coordinates; round back to integers.
            df = df.with_columns(
                [
                    pl.col(sk).round(0).cast(pl.Int64).alias(sk),
                    pl.col(ek).round(0).cast(pl.Int64).alias(ek),
                ]
            )
        # Which temp columns exist is known here, so the lazy plan's schema
        # does not have to be resolved to clean them up.
        df = df.drop(padsk)
        if shrinking:
            df = df.drop(midsk)
        return df

expand(pad=None, scale=None, side='both', cols=['chrom', 'start', 'end'])

Expand each interval by an amount specified with pad.

Note

Adapted to Polars API from bioframe.expand

Negative values for pad shrink the interval, up to the midpoint. Multiplicative rescaling of intervals enabled with scale. Only one of pad or scale can be provided.

Parameters:

Name Type Description Default
pad

The amount by which the intervals are additively expanded on each side. Negative values for pad shrink intervals, but not beyond the interval midpoint. Either pad or scale must be supplied.

None
scale

The factor by which to scale intervals multiplicatively on each side, e.g. scale=2 doubles each interval, scale=0 returns midpoints, and scale=1 returns original intervals. Default None. Either pad or scale must be supplied.

None
side

Which side to expand, possible values are 'left', 'right' and 'both'. Default 'both'.

'both'
cols

The names of columns containing the chromosome, start and end of the genomic intervals. Default values are 'chrom', 'start', 'end'.

['chrom', 'start', 'end']
Source code in polars_bio/polars_ext.py
def expand(
    self,
    pad: Union[int, None] = None,
    scale: Union[float, None] = None,
    side: str = "both",
    cols: Union[list[str], None] = ["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Expand each interval by an amount specified with `pad`.
    !!! Note
        Adapted to Polars API from [bioframe.expand](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L150)

    Negative values for pad shrink the interval, up to the midpoint.
    Multiplicative rescaling of intervals enabled with scale. Only one of pad
    or scale can be provided.

    Parameters:
        pad :
            The amount by which the intervals are additively expanded *on each side*.
            Negative values for pad shrink intervals, but not beyond the interval
            midpoint. Either `pad` or `scale` must be supplied.

        scale :
            The factor by which to scale intervals multiplicatively on each side, e.g
            ``scale=2`` doubles each interval, ``scale=0`` returns midpoints, and
            ``scale=1`` returns original intervals. Default None.
            Either `pad` or `scale` must be supplied.

        side :
            Which side to expand, possible values are 'left', 'right' and 'both'.
            Default 'both'.

        cols :
            The names of columns containing the chromosome, start and end of the
            genomic intervals. Default values are 'chrom', 'start', 'end'.

    Returns:
        A LazyFrame in which the start and/or end columns have been replaced
        by the expanded coordinates.

    Raises:
        ValueError: If both or neither of `pad` and `scale` are supplied, if
            `scale` is negative, if `pad` is not an integer, or if `side` is
            not one of 'both', 'left', 'right'.

    """
    df = self._ldf
    # Only the start and end columns are modified; the chromosome name is
    # unpacked solely to keep the "exactly three names" requirement.
    _, sk, ek = ["chrom", "start", "end"] if cols is None else cols
    # Temporary helper columns. The names are deliberately unlikely to collide
    # with user data: a temp column overwrites any existing column of the same
    # name and is dropped again before returning.
    padsk = "_pb_expand_pad"
    midsk = "_pb_expand_mid"

    if scale is not None and pad is not None:
        raise ValueError("only one of pad or scale can be supplied")
    elif scale is not None:
        if scale < 0:
            raise ValueError("multiplicative scale must be >=0")
        # Per-side padding that turns an interval of length L into scale * L.
        pad_expr = 0.5 * (scale - 1) * (pl.col(ek) - pl.col(sk))
    elif pad is not None:
        if not isinstance(pad, int):
            raise ValueError("additive pad must be integer")
        pad_expr = pl.lit(pad)
    else:
        raise ValueError("either pad or scale must be supplied")
    # An unknown side used to be ignored silently, returning the intervals
    # without any expansion applied.
    if side not in ("both", "left", "right"):
        raise ValueError("side must be one of 'both', 'left' or 'right'")

    df = df.with_columns([pad_expr.alias(padsk)])
    if side in ("both", "left"):
        df = df.with_columns([(pl.col(sk) - pl.col(padsk)).alias(sk)])
    if side in ("both", "right"):
        df = df.with_columns([(pl.col(ek) + pl.col(padsk)).alias(ek)])

    # A negative pad shrinks intervals; clamp both ends at the midpoint so that
    # start never ends up beyond end.
    shrinking = pad is not None and pad < 0
    if shrinking:
        df = df.with_columns(
            [(pl.col(sk) + 0.5 * (pl.col(ek) - pl.col(sk))).alias(midsk)]
        )
        df = df.with_columns(
            [
                pl.min_horizontal(pl.col(sk), pl.col(midsk))
                .cast(pl.Int64)
                .alias(sk),
                pl.max_horizontal(pl.col(ek), pl.col(midsk))
                .cast(pl.Int64)
                .alias(ek),
            ]
        )
    if scale is not None:
        # Scaling produces fractional coordinates; round back to integers.
        df = df.with_columns(
            [
                pl.col(sk).round(0).cast(pl.Int64).alias(sk),
                pl.col(ek).round(0).cast(pl.Int64).alias(ek),
            ]
        )
    # Which temp columns exist is known here, so the lazy plan's schema does
    # not have to be resolved to clean them up.
    df = df.drop(padsk)
    if shrinking:
        df = df.drop(midsk)
    return df

nearest(other_df, suffixes=('_1', '_2'), overlap_filter=FilterOp.Strict, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'])

Note

Alias for nearest

Source code in polars_bio/polars_ext.py
def nearest(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("_1", "_2"),
    overlap_filter=FilterOp.Strict,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Run the nearest operation with this LazyFrame as the first set and
    `other_df` as the second set.

    !!! note
        Alias for [nearest](api.md#polars_bio.nearest)
    """
    # Every option is handed over to the module-level function unchanged.
    forwarded = {
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
    }
    return pb.nearest(self._ldf, other_df, **forwarded)

overlap(other_df, suffixes=('_1', '_2'), how='inner', overlap_filter=FilterOp.Strict, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'])

Note

Alias for overlap

Source code in polars_bio/polars_ext.py
def overlap(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("_1", "_2"),
    how="inner",
    overlap_filter=FilterOp.Strict,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Overlap this LazyFrame (first set) with `other_df` (second set).

    !!! note
        Alias for [overlap](api.md#polars_bio.overlap)
    """
    # Every option is handed over to the module-level function unchanged.
    forwarded = {
        "how": how,
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
    }
    return pb.overlap(self._ldf, other_df, **forwarded)

sort(cols=['chrom', 'start', 'end'])

Sort a bedframe.

Note

Adapted to Polars API from bioframe.sort_bedframe

Parameters:

Name Type Description Default
cols Union[tuple[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals.

['chrom', 'start', 'end']

Example

import polars_bio as pb
df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
df.pb.sort().limit(5).collect()
<class 'builtins.PyExpr'>
shape: (5, 9)
┌───────┬─────────┬─────────┬──────┬───┬────────┬────────────┬──────────┬──────────┐
│ chrom ┆ start   ┆ end     ┆ name ┆ … ┆ strand ┆ thickStart ┆ thickEnd ┆ itemRgb  │
│ ---   ┆ ---     ┆ ---     ┆ ---  ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---      │
│ str   ┆ i64     ┆ i64     ┆ str  ┆   ┆ str    ┆ str        ┆ str      ┆ str      │
╞═══════╪═════════╪═════════╪══════╪═══╪════════╪════════════╪══════════╪══════════╡
│ chr1  ┆ 193500  ┆ 194500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
│ chr1  ┆ 618500  ┆ 619500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
│ chr1  ┆ 974500  ┆ 975500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
│ chr1  ┆ 1301500 ┆ 1302500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
│ chr1  ┆ 1479500 ┆ 1480500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
└───────┴─────────┴─────────┴──────┴───┴────────┴────────────┴──────────┴──────────┘

Source code in polars_bio/polars_ext.py
def sort(
    self, cols: Union[list[str], None] = ["chrom", "start", "end"]
) -> pl.LazyFrame:
    """
    Sort a bedframe.
    !!! note
        Adapted to Polars API from [bioframe.sort_bedframe](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L1698)

    Parameters:
        cols: The names of columns containing the chromosome, start and end of the genomic intervals.
            `None` falls back to 'chrom', 'start', 'end'.

    Returns:
        The LazyFrame sorted by `cols`.

    !!! Example
          ```python
          import polars_bio as pb
          df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
          df.pb.sort().limit(5).collect()
          ```
          ```plaintext
          <class 'builtins.PyExpr'>
          shape: (5, 9)
          ┌───────┬─────────┬─────────┬──────┬───┬────────┬────────────┬──────────┬──────────┐
          │ chrom ┆ start   ┆ end     ┆ name ┆ … ┆ strand ┆ thickStart ┆ thickEnd ┆ itemRgb  │
          │ ---   ┆ ---     ┆ ---     ┆ ---  ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---      │
          │ str   ┆ i64     ┆ i64     ┆ str  ┆   ┆ str    ┆ str        ┆ str      ┆ str      │
          ╞═══════╪═════════╪═════════╪══════╪═══╪════════╪════════════╪══════════╪══════════╡
          │ chr1  ┆ 193500  ┆ 194500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
          │ chr1  ┆ 618500  ┆ 619500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
          │ chr1  ┆ 974500  ┆ 975500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
          │ chr1  ┆ 1301500 ┆ 1302500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
          │ chr1  ┆ 1479500 ┆ 1480500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 │
          └───────┴─────────┴─────────┴──────┴───┴────────┴────────────┴──────────┴──────────┘

          ```

    """
    # The annotation admits None; resolve it to the default interval columns
    # instead of handing `by=None` to LazyFrame.sort.
    sort_cols = ["chrom", "start", "end"] if cols is None else cols
    return self._ldf.sort(by=sort_cols)

nearest(df1, df2, overlap_filter=FilterOp.Strict, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], output_type='polars.LazyFrame', streaming=False)

Find pairs of nearest genomic intervals. Bioframe inspired API.

Parameters:

Name Type Description Default
df1 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.

required
df2 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.

required
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider(Weak or Strict).

Strict
cols1 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
cols2 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
suffixes tuple[str, str]

Suffixes for the columns of the two overlapped sets.

('_1', '_2')
on_cols Union[list[str], None]

List of additional column names to join on. default is None.

None
output_type str

Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" are also supported.

'polars.LazyFrame'
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False

Returns:

Type Description
Union[LazyFrame, DataFrame, DataFrame]

polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Note

The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
def nearest(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    overlap_filter: FilterOp = FilterOp.Strict,
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
    """
    Run the nearest-interval range operation (`RangeOp.Nearest`) on two sets of genomic intervals.
    Bioframe inspired API.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
        overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict).
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals in `df1`. `None` selects `DEFAULT_INTERVAL_COLUMNS`.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals in `df2`. `None` selects `DEFAULT_INTERVAL_COLUMNS`.
        suffixes: Suffixes for the columns of the two sets.
        on_cols: List of additional column names to join on. default is None.
            It is currently only passed to input validation, not to the range operation itself.
        output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" are also supported.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming-out-of-core-processing) engine.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame holding the result of the operation.

    Note:
        The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.

    Todo:
        Support for on_cols.
    """

    # Validation sees the arguments exactly as the caller passed them, i.e.
    # before a None column list is replaced by the defaults.
    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how="inner")

    if cols1 is None:
        cols1 = DEFAULT_INTERVAL_COLUMNS
    if cols2 is None:
        cols2 = DEFAULT_INTERVAL_COLUMNS

    return range_operation(
        df1,
        df2,
        RangeOptions(
            range_op=RangeOp.Nearest,
            filter_op=overlap_filter,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
            streaming=streaming,
        ),
        output_type,
        ctx,
    )

overlap(df1, df2, how='inner', overlap_filter=FilterOp.Strict, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], algorithm='Coitrees', output_type='polars.LazyFrame', streaming=False)

Find pairs of overlapping genomic intervals. Bioframe inspired API.

Parameters:

Name Type Description Default
df1 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.

required
df2 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.

required
how str

How to handle the overlaps on the two dataframes. inner: use intersection of the set of intervals from df1 and df2, optional.

'inner'
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider(Weak or Strict).

Strict
cols1 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
cols2 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
suffixes tuple[str, str]

Suffixes for the columns of the two overlapped sets.

('_1', '_2')
on_cols Union[list[str], None]

List of additional column names to join on. default is None.

None
algorithm str

The algorithm to use for the overlap operation.

'Coitrees'
output_type str

Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" are also supported.

'polars.LazyFrame'
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False

Returns:

Type Description
Union[LazyFrame, DataFrame, DataFrame]

polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Note
  1. The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.
  2. Streaming is only supported for polars.LazyFrame output.
Example
import polars_bio as pb
import pandas as pd

df1 = pd.DataFrame([
    ['chr1', 1, 5],
    ['chr1', 3, 8],
    ['chr1', 8, 10],
    ['chr1', 12, 14]],
columns=['chrom', 'start', 'end']
)

df2 = pd.DataFrame(
[['chr1', 4, 8],
 ['chr1', 10, 11]],
columns=['chrom', 'start', 'end' ]
)
overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

overlapping_intervals
    chrom_1         start_1     end_1 chrom_2       start_2  end_2
0     chr1            1          5     chr1            4          8
1     chr1            3          8     chr1            4          8
Todo

Support for on_cols.

Source code in polars_bio/range_op.py
def overlap(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    how: str = "inner",
    overlap_filter: FilterOp = FilterOp.Strict,
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    algorithm: str = "Coitrees",
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame]:
    """
    Find pairs of overlapping genomic intervals.
    Bioframe inspired API.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
        how: How to handle the overlaps on the two dataframes. inner: use intersection of the set of intervals from df1 and df2, optional.
        overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict).
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals in `df1`. `None` selects `DEFAULT_INTERVAL_COLUMNS`.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals in `df2`. `None` selects `DEFAULT_INTERVAL_COLUMNS`.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. default is None.
            It is currently only passed to input validation, not to the range operation itself.
        algorithm: The algorithm to use for the overlap operation.
        output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" are also supported.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming-out-of-core-processing) engine.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Note:
        1. The default output format, i.e.  [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.
        2. Streaming is only supported for polars.LazyFrame output.

    Example:
        ```python
        import polars_bio as pb
        import pandas as pd

        df1 = pd.DataFrame([
            ['chr1', 1, 5],
            ['chr1', 3, 8],
            ['chr1', 8, 10],
            ['chr1', 12, 14]],
        columns=['chrom', 'start', 'end']
        )

        df2 = pd.DataFrame(
        [['chr1', 4, 8],
         ['chr1', 10, 11]],
        columns=['chrom', 'start', 'end' ]
        )
        overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

        overlapping_intervals
            chrom_1         start_1     end_1 chrom_2       start_2  end_2
        0     chr1            1          5     chr1            4          8
        1     chr1            3          8     chr1            4          8

        ```

    Todo:
         Support for on_cols.
    """

    # Validation sees the arguments exactly as the caller passed them, i.e.
    # before a None column list is replaced by the defaults.
    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how)

    if cols1 is None:
        cols1 = DEFAULT_INTERVAL_COLUMNS
    if cols2 is None:
        cols2 = DEFAULT_INTERVAL_COLUMNS

    return range_operation(
        df1,
        df2,
        RangeOptions(
            range_op=RangeOp.Overlap,
            filter_op=overlap_filter,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
            overlap_alg=algorithm,
            streaming=streaming,
        ),
        output_type,
        ctx,
    )

read_bam(path)

Read a BAM file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the BAM file.

required
Source code in polars_bio/io.py
def read_bam(path: str) -> pl.LazyFrame:
    """
    Lazily scan a BAM file.

    Parameters:
        path: The path to the BAM file.

    Returns:
        The result of `file_lazy_scan` for the BAM input format.
    """
    input_format = InputFormat.Bam
    return file_lazy_scan(path, input_format)

read_fasta(path)

Read a FASTA file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the FASTA file.

required
Source code in polars_bio/io.py
def read_fasta(path: str) -> pl.LazyFrame:
    """
    Lazily scan a FASTA file.

    Parameters:
        path: The path to the FASTA file.

    Returns:
        The result of `file_lazy_scan` for the FASTA input format.
    """
    input_format = InputFormat.Fasta
    return file_lazy_scan(path, input_format)

read_fastq(path)

Read a FASTQ file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the FASTQ file.

required
Source code in polars_bio/io.py
def read_fastq(path: str) -> pl.LazyFrame:
    """
    Lazily scan a FASTQ file.

    Parameters:
        path: The path to the FASTQ file.

    Returns:
        The result of `file_lazy_scan` for the FASTQ input format.
    """
    input_format = InputFormat.Fastq
    return file_lazy_scan(path, input_format)

read_table(path, schema=None, **kwargs)

Read a tab-delimited (i.e. BED) file into a Polars LazyFrame. Tries to be compatible with Bioframe's read_table but faster and lazy. Schema should follow the Bioframe's schema format.

Parameters:

Name Type Description Default
path str

The path to the file.

required
schema Dict

Schema should follow the Bioframe's schema format.

None
Source code in polars_bio/io.py
def read_table(path: str, schema: "str | None" = None, **kwargs) -> pl.LazyFrame:
    """
     Read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
     Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
     but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

    Parameters:
        path: The path to the file.
        schema: Name of the schema to apply, used as a key into `SCHEMAS` (e.g. `"bed9"`).
            Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
            When `None`, the columns keep the names assigned by `pl.scan_csv`.
        **kwargs: Additional keyword arguments forwarded to `pl.scan_csv`.

    Returns:
        A LazyFrame over the file, with columns renamed to the schema's names when a schema is given.

    Raises:
        KeyError: If `schema` is not a key of `SCHEMAS`.
        ValueError: If the schema's column count differs from the input's.

    """
    df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
    if schema is None:
        return df

    columns = SCHEMAS[schema]
    # Resolve the input schema once; it is needed for the check and the message.
    n_input_columns = len(df.collect_schema())
    if len(columns) != n_input_columns:
        raise ValueError(
            f"Schema incompatible with the input. Expected {len(columns)} columns in a schema, got {n_input_columns} in the input data file. Please provide a valid schema."
        )
    # The header-less scan yields columns named column_1..column_N; map them to
    # the schema names in a single rename instead of one plan node per column.
    return df.rename(
        {f"column_{i}": name for i, name in enumerate(columns, start=1)}
    )

read_vcf(path)

Read a VCF file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required
Source code in polars_bio/io.py
def read_vcf(path: str) -> pl.LazyFrame:
    """
    Lazily scan a VCF file.

    Parameters:
        path: The path to the VCF file.

    Returns:
        The result of `file_lazy_scan` for the VCF input format.
    """
    input_format = InputFormat.Vcf
    return file_lazy_scan(path, input_format)

visualize_intervals(df, label='overlapping pair')

Visualize the overlapping intervals.

Parameters:

Name Type Description Default
df Union[DataFrame, DataFrame]

Pandas DataFrame or Polars DataFrame. The DataFrame containing the overlapping intervals

required
label str

TBD

'overlapping pair'
Source code in polars_bio/range_viz.py
def visualize_intervals(
    df: Union[pd.DataFrame, pl.DataFrame], label: str = "overlapping pair"
) -> None:
    """
    Visualize the overlapping intervals, drawing one plot per row of `df`.

    Parameters:
        df: Pandas DataFrame or Polars DataFrame. The DataFrame containing the overlapping intervals.
            Each row is read through its `start_1`, `end_1`, `start_2` and `end_2` fields.
        label: Prefix of each plot title; the title is rendered as `"{label} #{i}"`,
            where `i` is the row index yielded by `iterrows()`.

    """
    # NOTE(review): `assert` is skipped when Python runs with -O, so this type
    # check is not guaranteed to execute.
    assert isinstance(
        df, (pd.DataFrame, pl.DataFrame)
    ), "df must be a Pandas or Polars DataFrame"
    # Polars input is converted so that the row iteration below always goes
    # through pandas.
    df = df if isinstance(df, pd.DataFrame) else df.to_pandas()
    for i, reg_pair in df.iterrows():
        # One call per row; the first interval is placed on level 2 and the
        # second on level 1.
        # NOTE(review): `bf` is presumably bioframe and `plt` matplotlib.pyplot; confirm against the module imports.
        bf.vis.plot_intervals_arr(
            starts=[reg_pair.start_1, reg_pair.start_2],
            ends=[reg_pair.end_1, reg_pair.end_2],
            colors=["skyblue", "lightpink"],
            levels=[2, 1],
            # NOTE(review): hard-coded x-axis limits; presumably sized for the small documentation example, confirm for real coordinates.
            xlim=(0, 16),
            show_coords=True,
        )
        plt.title(f"{label} #{i}")