Skip to content

βš™οΈ API reference

API structure

There are 2 ways of using polars-bio API:

  • directly on a Polars LazyFrame under a registered pb namespace

Example

 >>> type(df)
 <class 'polars.lazyframe.frame.LazyFrame'>
   import polars_bio as pb
   df.pb.sort().limit(5).collect()
  • using polars_bio module

Example

   import polars_bio as pb
   df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")

Tip

  1. Not all operations are available in both ways.
  2. You can of course use both ways in the same script.

LazyFrame

Source code in polars_bio/polars_ext.py
@pl.api.register_lazyframe_namespace("pb")
class PolarsRangesOperations:
    """Genomic range operations exposed on every ``pl.LazyFrame`` under the ``pb`` namespace.

    Registered via ``pl.api.register_lazyframe_namespace``, so after importing
    polars_bio a LazyFrame can be used as ``df.pb.<method>()``.  Most methods
    are thin aliases that delegate to the corresponding module-level ``pb``
    functions, passing the wrapped frame as the first input; ``sort`` and
    ``expand`` are implemented directly with Polars expressions.
    """

    def __init__(self, ldf: pl.LazyFrame) -> None:
        # The wrapped LazyFrame that every operation reads from.
        self._ldf = ldf

    def overlap(
        self,
        other_df: pl.LazyFrame,
        suffixes: tuple[str, str] = ("_1", "_2"),
        how="inner",
        overlap_filter=FilterOp.Strict,
        cols1=["chrom", "start", "end"],
        cols2=["chrom", "start", "end"],
    ) -> pl.LazyFrame:
        """
        !!! note
            Alias for [overlap](api.md#polars_bio.overlap)
        """
        return pb.overlap(
            self._ldf,
            other_df,
            how=how,
            overlap_filter=overlap_filter,
            suffixes=suffixes,
            cols1=cols1,
            cols2=cols2,
        )

    def nearest(
        self,
        other_df: pl.LazyFrame,
        suffixes: tuple[str, str] = ("_1", "_2"),
        overlap_filter=FilterOp.Strict,
        cols1=["chrom", "start", "end"],
        cols2=["chrom", "start", "end"],
    ) -> pl.LazyFrame:
        """
        !!! note
            Alias for [nearest](api.md#polars_bio.nearest)
        """
        return pb.nearest(
            self._ldf,
            other_df,
            overlap_filter=overlap_filter,
            suffixes=suffixes,
            cols1=cols1,
            cols2=cols2,
        )

    def count_overlaps(
        self,
        other_df: pl.LazyFrame,
        overlap_filter=FilterOp.Strict,
        suffixes: tuple[str, str] = ("", "_"),
        cols1=["chrom", "start", "end"],
        cols2=["chrom", "start", "end"],
        on_cols: Union[list[str], None] = None,
        naive_query: bool = True,
    ) -> pl.LazyFrame:
        """
        !!! note
            Alias for [count_overlaps](api.md#polars_bio.count_overlaps)
        """
        return pb.count_overlaps(
            self._ldf,
            other_df,
            overlap_filter=overlap_filter,
            suffixes=suffixes,
            cols1=cols1,
            cols2=cols2,
            on_cols=on_cols,
            naive_query=naive_query,
        )

    def merge(
        self,
        overlap_filter: FilterOp = FilterOp.Strict,
        min_dist: float = 0,
        cols: Union[list[str], None] = None,
    ) -> pl.LazyFrame:
        """
        !!! note
            Alias for [merge](api.md#polars_bio.merge)
        """
        return pb.merge(
            self._ldf, overlap_filter=overlap_filter, min_dist=min_dist, cols=cols
        )

    def sort(
        self, cols: Union[tuple[str], None] = ["chrom", "start", "end"]
    ) -> pl.LazyFrame:
        """
        Sort a bedframe.
        !!! note
            Adapted to Polars API from [bioframe.sort_bedframe](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L1698)

        Parameters:
            cols: The names of columns containing the chromosome, start and end of the genomic intervals.


        !!! Example
              ```python
              import polars_bio as pb
              df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
              df.pb.sort().limit(5).collect()
              ```
                ```plaintext
                <class 'builtins.PyExpr'>
                shape: (5, 9)
                β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
                β”‚ chrom ┆ start   ┆ end     ┆ name ┆ … ┆ strand ┆ thickStart ┆ thickEnd ┆ itemRgb  β”‚
                β”‚ ---   ┆ ---     ┆ ---     ┆ ---  ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---      β”‚
                β”‚ str   ┆ i64     ┆ i64     ┆ str  ┆   ┆ str    ┆ str        ┆ str      ┆ str      β”‚
                β•žβ•β•β•β•β•β•β•β•ͺ═════════β•ͺ═════════β•ͺ══════β•ͺ═══β•ͺ════════β•ͺ════════════β•ͺ══════════β•ͺ══════════║
                β”‚ chr1  ┆ 193500  ┆ 194500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
                β”‚ chr1  ┆ 618500  ┆ 619500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
                β”‚ chr1  ┆ 974500  ┆ 975500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
                β”‚ chr1  ┆ 1301500 ┆ 1302500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
                β”‚ chr1  ┆ 1479500 ┆ 1480500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
                β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜

                ```

        """
        return self._ldf.sort(by=cols)

    def expand(
        self,
        pad: Union[int, None] = None,
        scale: Union[float, None] = None,
        side: str = "both",
        cols: Union[list[str], None] = ["chrom", "start", "end"],
    ) -> pl.LazyFrame:
        """
        Expand each interval by an amount specified with `pad`.
        !!! Note
            Adapted to Polars API from [bioframe.expand](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L150)

        Negative values for pad shrink the interval, up to the midpoint.
        Multiplicative rescaling of intervals enabled with scale. Only one of pad
        or scale can be provided.

        Parameters:
            pad :
                The amount by which the intervals are additively expanded *on each side*.
                Negative values for pad shrink intervals, but not beyond the interval
                midpoint. Either `pad` or `scale` must be supplied.

            scale :
                The factor by which to scale intervals multiplicatively on each side, e.g
                ``scale=2`` doubles each interval, ``scale=0`` returns midpoints, and
                ``scale=1`` returns original intervals. Default False.
                Either `pad` or `scale` must be supplied.

            side :
                Which side to expand, possible values are 'left', 'right' and 'both'.
                Default 'both'.

            cols :
                The names of columns containing the chromosome, start and end of the
                genomic intervals. Default values are 'chrom', 'start', 'end'.

        Raises:
            ValueError: if both or neither of `pad`/`scale` are given, if `pad`
                is not an int, or if `scale` is negative.
        """
        df = self._ldf
        # Resolve column names; ck (chrom) is unpacked for symmetry but not used below.
        ck, sk, ek = ["chrom", "start", "end"] if cols is None else cols
        # Names of the temporary helper columns added below and dropped at the end.
        padsk = "pads"
        midsk = "mids"

        if scale is not None and pad is not None:
            raise ValueError("only one of pad or scale can be supplied")
        elif scale is not None:
            if scale < 0:
                raise ValueError("multiplicative scale must be >=0")
            # Per-row pad: half of the length change implied by the scale factor.
            df = df.with_columns(
                [(0.5 * (scale - 1) * (pl.col(ek) - pl.col(sk))).alias(padsk)]
            )
        elif pad is not None:
            if not isinstance(pad, int):
                raise ValueError("additive pad must be integer")
            # Constant pad applied to every interval.
            df = df.with_columns([pl.lit(pad).alias(padsk)])
        else:
            raise ValueError("either pad or scale must be supplied")
        # Apply the pad on the requested side(s).
        if side == "both" or side == "left":
            df = df.with_columns([(pl.col(sk) - pl.col(padsk)).alias(sk)])
        if side == "both" or side == "right":
            df = df.with_columns([(pl.col(ek) + pl.col(padsk)).alias(ek)])

        if pad is not None:
            if pad < 0:
                # Negative pad shrinks intervals; clamp start/end at the midpoint
                # so the interval cannot invert.
                # NOTE(review): the midpoint is computed from the already-shrunk
                # coordinates, not the originals — confirm this matches
                # bioframe.expand semantics.
                df = df.with_columns(
                    [(pl.col(sk) + 0.5 * (pl.col(ek) - pl.col(sk))).alias(midsk)]
                )
                df = df.with_columns(
                    [
                        pl.min_horizontal(pl.col(sk), pl.col(midsk))
                        .cast(pl.Int64)
                        .alias(sk),
                        pl.max_horizontal(pl.col(ek), pl.col(midsk))
                        .cast(pl.Int64)
                        .alias(ek),
                    ]
                )
        if scale is not None:
            # Scaled coordinates are fractional; round back to integer positions.
            df = df.with_columns(
                [
                    pl.col(sk).round(0).cast(pl.Int64).alias(sk),
                    pl.col(ek).round(0).cast(pl.Int64).alias(ek),
                ]
            )
        # Drop whichever helper columns were actually added.
        schema = df.collect_schema().names()
        if padsk in schema:
            df = df.drop(padsk)
        if midsk in schema:
            df = df.drop(midsk)
        return df

    def coverage(
        self,
        other_df: pl.LazyFrame,
        cols1=["chrom", "start", "end"],
        cols2=["chrom", "start", "end"],
        suffixes: tuple[str, str] = ("_1", "_2"),
    ) -> pl.LazyFrame:
        """
        !!! note
            Alias for [coverage](api.md#polars_bio.coverage)
        """
        return pb.coverage(
            self._ldf, other_df, cols1=cols1, cols2=cols2, suffixes=suffixes
        )

count_overlaps(other_df, overlap_filter=FilterOp.Strict, suffixes=('', '_'), cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], on_cols=None, naive_query=True)

Note

Alias for count_overlaps

Source code in polars_bio/polars_ext.py
def count_overlaps(
    self,
    other_df: pl.LazyFrame,
    overlap_filter=FilterOp.Strict,
    suffixes: tuple[str, str] = ("", "_"),
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
    on_cols: Union[list[str], None] = None,
    naive_query: bool = True,
) -> pl.LazyFrame:
    """
    !!! note
        Alias for [count_overlaps](api.md#polars_bio.count_overlaps)
    """
    # Gather keyword arguments once, then delegate to the module-level API
    # with the wrapped LazyFrame as the first input.
    options = {
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
        "on_cols": on_cols,
        "naive_query": naive_query,
    }
    return pb.count_overlaps(self._ldf, other_df, **options)

coverage(other_df, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], suffixes=('_1', '_2'))

Note

Alias for coverage

Source code in polars_bio/polars_ext.py
def coverage(
    self,
    other_df: pl.LazyFrame,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
    suffixes: tuple[str, str] = ("_1", "_2"),
) -> pl.LazyFrame:
    """
    !!! note
        Alias for [coverage](api.md#polars_bio.coverage)
    """
    # Forward to the module-level implementation; the wrapped frame goes first.
    kwargs = {"cols1": cols1, "cols2": cols2, "suffixes": suffixes}
    return pb.coverage(self._ldf, other_df, **kwargs)

expand(pad=None, scale=None, side='both', cols=['chrom', 'start', 'end'])

Expand each interval by an amount specified with pad.

Note

Adapted to Polars API from bioframe.expand

Negative values for pad shrink the interval, up to the midpoint. Multiplicative rescaling of intervals enabled with scale. Only one of pad or scale can be provided.

Parameters:

Name Type Description Default
pad

The amount by which the intervals are additively expanded on each side. Negative values for pad shrink intervals, but not beyond the interval midpoint. Either pad or scale must be supplied.

None
scale

The factor by which to scale intervals multiplicatively on each side, e.g scale=2 doubles each interval, scale=0 returns midpoints, and scale=1 returns original intervals. Default False. Either pad or scale must be supplied.

None
side

Which side to expand, possible values are 'left', 'right' and 'both'. Default 'both'.

'both'
cols

The names of columns containing the chromosome, start and end of the genomic intervals. Default values are 'chrom', 'start', 'end'.

['chrom', 'start', 'end']
Source code in polars_bio/polars_ext.py
def expand(
    self,
    pad: Union[int, None] = None,
    scale: Union[float, None] = None,
    side: str = "both",
    cols: Union[list[str], None] = ["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    Expand each interval by an amount specified with `pad`.
    !!! Note
        Adapted to Polars API from [bioframe.expand](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L150)

    Negative values for pad shrink the interval, up to the midpoint.
    Multiplicative rescaling of intervals enabled with scale. Only one of pad
    or scale can be provided.

    Parameters:
        pad :
            The amount by which the intervals are additively expanded *on each side*.
            Negative values for pad shrink intervals, but not beyond the interval
            midpoint. Either `pad` or `scale` must be supplied.

        scale :
            The factor by which to scale intervals multiplicatively on each side, e.g
            ``scale=2`` doubles each interval, ``scale=0`` returns midpoints, and
            ``scale=1`` returns original intervals. Default False.
            Either `pad` or `scale` must be supplied.

        side :
            Which side to expand, possible values are 'left', 'right' and 'both'.
            Default 'both'.

        cols :
            The names of columns containing the chromosome, start and end of the
            genomic intervals. Default values are 'chrom', 'start', 'end'.

    Raises:
        ValueError: if both or neither of `pad`/`scale` are given, if `pad`
            is not an int, or if `scale` is negative.
    """
    df = self._ldf
    # Resolve column names; ck (chrom) is unpacked for symmetry but not used below.
    ck, sk, ek = ["chrom", "start", "end"] if cols is None else cols
    # Names of the temporary helper columns added below and dropped at the end.
    padsk = "pads"
    midsk = "mids"

    if scale is not None and pad is not None:
        raise ValueError("only one of pad or scale can be supplied")
    elif scale is not None:
        if scale < 0:
            raise ValueError("multiplicative scale must be >=0")
        # Per-row pad: half of the length change implied by the scale factor.
        df = df.with_columns(
            [(0.5 * (scale - 1) * (pl.col(ek) - pl.col(sk))).alias(padsk)]
        )
    elif pad is not None:
        if not isinstance(pad, int):
            raise ValueError("additive pad must be integer")
        # Constant pad applied to every interval.
        df = df.with_columns([pl.lit(pad).alias(padsk)])
    else:
        raise ValueError("either pad or scale must be supplied")
    # Apply the pad on the requested side(s).
    if side == "both" or side == "left":
        df = df.with_columns([(pl.col(sk) - pl.col(padsk)).alias(sk)])
    if side == "both" or side == "right":
        df = df.with_columns([(pl.col(ek) + pl.col(padsk)).alias(ek)])

    if pad is not None:
        if pad < 0:
            # Negative pad shrinks intervals; clamp start/end at the midpoint
            # so the interval cannot invert.
            # NOTE(review): the midpoint is computed from the already-shrunk
            # coordinates, not the originals — confirm this matches
            # bioframe.expand semantics.
            df = df.with_columns(
                [(pl.col(sk) + 0.5 * (pl.col(ek) - pl.col(sk))).alias(midsk)]
            )
            df = df.with_columns(
                [
                    pl.min_horizontal(pl.col(sk), pl.col(midsk))
                    .cast(pl.Int64)
                    .alias(sk),
                    pl.max_horizontal(pl.col(ek), pl.col(midsk))
                    .cast(pl.Int64)
                    .alias(ek),
                ]
            )
    if scale is not None:
        # Scaled coordinates are fractional; round back to integer positions.
        df = df.with_columns(
            [
                pl.col(sk).round(0).cast(pl.Int64).alias(sk),
                pl.col(ek).round(0).cast(pl.Int64).alias(ek),
            ]
        )
    # Drop whichever helper columns were actually added.
    schema = df.collect_schema().names()
    if padsk in schema:
        df = df.drop(padsk)
    if midsk in schema:
        df = df.drop(midsk)
    return df

merge(overlap_filter=FilterOp.Strict, min_dist=0, cols=None)

Note

Alias for merge

Source code in polars_bio/polars_ext.py
def merge(
    self,
    overlap_filter: FilterOp = FilterOp.Strict,
    min_dist: float = 0,
    cols: Union[list[str], None] = None,
) -> pl.LazyFrame:
    """
    !!! note
        Alias for [merge](api.md#polars_bio.merge)
    """
    # Delegate to the module-level merge with the wrapped frame as input.
    params = {"overlap_filter": overlap_filter, "min_dist": min_dist, "cols": cols}
    return pb.merge(self._ldf, **params)

nearest(other_df, suffixes=('_1', '_2'), overlap_filter=FilterOp.Strict, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'])

Note

Alias for nearest

Source code in polars_bio/polars_ext.py
def nearest(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("_1", "_2"),
    overlap_filter=FilterOp.Strict,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    !!! note
        Alias for [nearest](api.md#polars_bio.nearest)
    """
    # Collect keyword arguments once, then hand off to the module-level API.
    opts = {
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
    }
    return pb.nearest(self._ldf, other_df, **opts)

overlap(other_df, suffixes=('_1', '_2'), how='inner', overlap_filter=FilterOp.Strict, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'])

Note

Alias for overlap

Source code in polars_bio/polars_ext.py
def overlap(
    self,
    other_df: pl.LazyFrame,
    suffixes: tuple[str, str] = ("_1", "_2"),
    how="inner",
    overlap_filter=FilterOp.Strict,
    cols1=["chrom", "start", "end"],
    cols2=["chrom", "start", "end"],
) -> pl.LazyFrame:
    """
    !!! note
        Alias for [overlap](api.md#polars_bio.overlap)
    """
    # Build the keyword arguments up front and forward to pb.overlap.
    opts = {
        "how": how,
        "overlap_filter": overlap_filter,
        "suffixes": suffixes,
        "cols1": cols1,
        "cols2": cols2,
    }
    return pb.overlap(self._ldf, other_df, **opts)

sort(cols=['chrom', 'start', 'end'])

Sort a bedframe.

Note

Adapted to Polars API from bioframe.sort_bedframe

Parameters:

Name Type Description Default
cols Union[tuple[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals.

['chrom', 'start', 'end']

Example

import polars_bio as pb
df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
df.pb.sort().limit(5).collect()
<class 'builtins.PyExpr'>
shape: (5, 9)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ chrom ┆ start   ┆ end     ┆ name ┆ … ┆ strand ┆ thickStart ┆ thickEnd ┆ itemRgb  β”‚
β”‚ ---   ┆ ---     ┆ ---     ┆ ---  ┆   ┆ ---    ┆ ---        ┆ ---      ┆ ---      β”‚
β”‚ str   ┆ i64     ┆ i64     ┆ str  ┆   ┆ str    ┆ str        ┆ str      ┆ str      β”‚
β•žβ•β•β•β•β•β•β•β•ͺ═════════β•ͺ═════════β•ͺ══════β•ͺ═══β•ͺ════════β•ͺ════════════β•ͺ══════════β•ͺ══════════║
β”‚ chr1  ┆ 193500  ┆ 194500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
β”‚ chr1  ┆ 618500  ┆ 619500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
β”‚ chr1  ┆ 974500  ┆ 975500  ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
β”‚ chr1  ┆ 1301500 ┆ 1302500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
β”‚ chr1  ┆ 1479500 ┆ 1480500 ┆ .    ┆ … ┆ +      ┆ .          ┆ .        ┆ 179,45,0 β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
Source code in polars_bio/polars_ext.py
def sort(
    self, cols: Union[tuple[str], None] = ["chrom", "start", "end"]
) -> pl.LazyFrame:
    """
    Sort a bedframe by its genomic interval columns.
    !!! note
        Adapted to Polars API from [bioframe.sort_bedframe](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/ops.py#L1698)

    Parameters:
        cols: The names of columns containing the chromosome, start and end of the genomic intervals.

    !!! Example
          ```python
          import polars_bio as pb
          df = pb.read_table("https://www.encodeproject.org/files/ENCFF001XKR/@@download/ENCFF001XKR.bed.gz",schema="bed9")
          df.pb.sort().limit(5).collect()
          ```
    """
    # Sorting by (chrom, start, end) by default orders intervals genomically.
    ldf = self._ldf
    return ldf.sort(by=cols)

count_overlaps(df1, df2, overlap_filter=FilterOp.Strict, suffixes=('', '_'), cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], on_cols=None, output_type='polars.LazyFrame', streaming=False, naive_query=True)

Count pairs of overlapping genomic intervals. Bioframe inspired API.

Parameters:

Name Type Description Default
df1 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported.

required
df2 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.

required
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for 0-based, Weak for 1-based coordinate systems.

Strict
suffixes tuple[str, str]

Suffixes for the columns of the two overlapped sets.

('', '_')
cols1 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
cols2 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
on_cols Union[list[str], None]

List of additional column names to join on. default is None.

None
output_type str

Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.

'polars.LazyFrame'
naive_query bool

If True, use a naive query based on the overlap operation to count overlaps.

True
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False

Returns: polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Example
import polars_bio as pb
import pandas as pd

df1 = pd.DataFrame([
    ['chr1', 1, 5],
    ['chr1', 3, 8],
    ['chr1', 8, 10],
    ['chr1', 12, 14]],
columns=['chrom', 'start', 'end']
)

df2 = pd.DataFrame(
[['chr1', 4, 8],
 ['chr1', 10, 11]],
columns=['chrom', 'start', 'end' ]
)
counts = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")

counts

chrom  start  end  count
0  chr1      1    5      1
1  chr1      3    8      1
2  chr1      8   10      0
3  chr1     12   14      0
Todo

Support return_input.

Source code in polars_bio/range_op.py
def count_overlaps(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    overlap_filter: FilterOp = FilterOp.Strict,
    suffixes: tuple[str, str] = ("", "_"),
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    on_cols: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
    naive_query: bool = True,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
    """
    Count pairs of overlapping genomic intervals.
    Bioframe inspired API.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED  and Parquet are supported.
        overlap_filter: FilterOp, optional. The type of overlap to consider(Weak or Strict). Strict for **0-based**, Weak for **1-based** coordinate systems.
        suffixes: Suffixes for the columns of the two overlapped sets.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        cols2:  The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        on_cols: List of additional column names to join on. default is None.
        output_type: Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.
        naive_query: If True, use naive query for counting overlaps based on overlaps.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Example:
        ```python
        import polars_bio as pb
        import pandas as pd

        df1 = pd.DataFrame([
            ['chr1', 1, 5],
            ['chr1', 3, 8],
            ['chr1', 8, 10],
            ['chr1', 12, 14]],
        columns=['chrom', 'start', 'end']
        )

        df2 = pd.DataFrame(
        [['chr1', 4, 8],
         ['chr1', 10, 11]],
        columns=['chrom', 'start', 'end' ]
        )
        counts = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")

        counts

        chrom  start  end  count
        0  chr1      1    5      1
        1  chr1      3    8      1
        2  chr1      8   10      0
        3  chr1     12   14      0
        ```

    Todo:
         Support return_input.
    """
    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how="inner")
    my_ctx = get_py_ctx()
    # Normalize optional arguments to concrete values.
    on_cols = [] if on_cols is None else on_cols
    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
    if naive_query:
        # Fast path: delegate counting to the native CountOverlapsNaive range op.
        range_options = RangeOptions(
            range_op=RangeOp.CountOverlapsNaive,
            filter_op=overlap_filter,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
            streaming=streaming,
        )
        # NOTE(review): the inputs are passed swapped (df2, df1), and the
        # module-level `ctx` is used here rather than `my_ctx` created above —
        # confirm both are intended.
        return range_operation(df2, df1, range_options, output_type, ctx)
    df1 = read_df_to_datafusion(my_ctx, df1)
    df2 = read_df_to_datafusion(my_ctx, df2)

    # TODO: guarantee no collisions
    # Temporary column names used by the sweep query below.
    s1start_s2end = "s1starts2end"
    s1end_s2start = "s1ends2start"
    contig = "contig"
    count = "count"
    starts = "starts"
    ends = "ends"
    is_s1 = "is_s1"
    suff, _ = suffixes
    # Swap the inputs so that counts are produced for the original df1 rows
    # (which end up marked is_s1 == 0 below).
    df1, df2 = df2, df1
    # Tag each source with an is_s1 marker and rename coordinates into the
    # shared temporary columns so the two frames can be unioned.
    df1 = df1.select(
        *(
            [
                literal(1).alias(is_s1),
                col(cols1[1]).alias(s1start_s2end),
                col(cols1[2]).alias(s1end_s2start),
                col(cols1[0]).alias(contig),
            ]
            + on_cols
        )
    )
    df2 = df2.select(
        *(
            [
                literal(0).alias(is_s1),
                col(cols2[2]).alias(s1end_s2start),
                col(cols2[1]).alias(s1start_s2end),
                col(cols2[0]).alias(contig),
            ]
            + on_cols
        )
    )

    # Single frame containing endpoints from both inputs.
    df = df1.union(df2)

    # Cumulative sums of is_s1 over windows ordered by each coordinate column,
    # partitioned per contig (and any extra join columns).  The is_s1
    # tie-break direction depends on the filter op, which controls whether
    # touching intervals count as overlapping (0- vs 1-based semantics).
    partitioning = [col(contig)] + [col(c) for c in on_cols]
    df = df.select(
        *(
            [
                s1start_s2end,
                s1end_s2start,
                contig,
                is_s1,
                datafusion.functions.sum(col(is_s1))
                .over(
                    datafusion.expr.Window(
                        partition_by=partitioning,
                        order_by=[
                            col(s1start_s2end).sort(),
                            col(is_s1).sort(
                                ascending=(overlap_filter == FilterOp.Strict)
                            ),
                        ],
                    )
                )
                .alias(starts),
                datafusion.functions.sum(col(is_s1))
                .over(
                    datafusion.expr.Window(
                        partition_by=partitioning,
                        order_by=[
                            col(s1end_s2start).sort(),
                            col(is_s1).sort(
                                ascending=(overlap_filter == FilterOp.Weak)
                            ),
                        ],
                    )
                )
                .alias(ends),
            ]
            + on_cols
        )
    )
    # Keep only the rows being counted (the original df1 side).
    df = df.filter(col(is_s1) == 0)
    # Restore the original column names (with suffix) and derive the overlap
    # count per interval as the difference of the two running sums.
    df = df.select(
        *(
            [
                col(contig).alias(cols1[0] + suff),
                col(s1end_s2start).alias(cols1[1] + suff),
                col(s1start_s2end).alias(cols1[2] + suff),
            ]
            + on_cols
            + [(col(starts) - col(ends)).alias(count)]
        )
    )

    return convert_result(df, output_type, streaming)

coverage(df1, df2, overlap_filter=FilterOp.Strict, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], output_type='polars.LazyFrame', streaming=False, read_options=None)

Calculate intervals coverage. Bioframe inspired API.

Parameters:

Name Type Description Default
df1 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported.

required
df2 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.

required
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for 0-based, Weak for 1-based coordinate systems.

Strict
cols1 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
cols2 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
suffixes tuple[str, str]

Suffixes for the columns of the two overlapped sets.

('_1', '_2')
on_cols Union[list[str], None]

List of additional column names to join on. default is None.

None
output_type str

Type of the output. default is "polars.LazyFrame", "polars.DataFrame", or "pandas.DataFrame" or "datafusion.DataFrame" are also supported.

'polars.LazyFrame'
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False
read_options Union[ReadOptions, None]

Additional options for reading the input files.

None

Returns:

Type Description
Union[LazyFrame, DataFrame, DataFrame, DataFrame]

polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Note

The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
def coverage(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    overlap_filter: FilterOp = FilterOp.Strict,
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = None,
    cols2: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
    read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
    """
    Calculate intervals coverage.
    Bioframe inspired API.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.
        overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for **0-based**, Weak for **1-based** coordinate systems.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
            Defaults to `["chrom", "start", "end"]` when `None`.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
            Defaults to `["chrom", "start", "end"]` when `None`.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. default is None.
        output_type: Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
        read_options: Additional options for reading the input files.


    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Note:
        The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.

    Example:

    Todo:
        Support for on_cols.
    """
    # Resolve None defaults first so validation always sees concrete column
    # lists. Using None instead of a mutable list literal as the default
    # avoids sharing one list object across all calls (ruff B006).
    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2

    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how="inner")

    range_options = RangeOptions(
        range_op=RangeOp.Coverage,
        filter_op=overlap_filter,
        suffixes=suffixes,
        columns_1=cols1,
        columns_2=cols2,
        streaming=streaming,
    )
    # NOTE(review): unlike nearest/overlap, df2 and df1 are passed in swapped
    # order here — presumably the Coverage range op expects the covering frame
    # first. Confirm against the underlying range_operation implementation.
    return range_operation(df2, df1, range_options, output_type, ctx, read_options)

describe_vcf(path)

Describe VCF INFO schema.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required

Example

import polars_bio as pb
vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
pb.describe_vcf(vcf_1).sort("name").limit(5)
    shape: (5, 3)
β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
β”‚ name      ┆ type    ┆ description                                                                          β”‚
β”‚ ---       ┆ ---     ┆ ---                                                                                  β”‚
β”‚ str       ┆ str     ┆ str                                                                                  β”‚
β•žβ•β•β•β•β•β•β•β•β•β•β•β•ͺ═════════β•ͺ══════════════════════════════════════════════════════════════════════════════════════║
β”‚ AC        ┆ Integer ┆ Number of non-reference alleles observed (biallelic sites only).                     β”‚
β”‚ AC_XX     ┆ Integer ┆ Number of non-reference XX alleles observed (biallelic sites only).                  β”‚
β”‚ AC_XY     ┆ Integer ┆ Number of non-reference XY alleles observed (biallelic sites only).                  β”‚
β”‚ AC_afr    ┆ Integer ┆ Number of non-reference African-American alleles observed (biallelic sites only).    β”‚
β”‚ AC_afr_XX ┆ Integer ┆ Number of non-reference African-American XX alleles observed (biallelic sites only). β”‚
β””β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
Source code in polars_bio/io.py
def describe_vcf(path: str) -> pl.DataFrame:
    """
    Describe the INFO schema of a VCF file.

    Parameters:
        path: The path to the VCF file.

    Returns:
        A polars DataFrame with one row per INFO field (`name`, `type`, `description`).

    !!! Example
        ```python
        import polars_bio as pb
        vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
        pb.describe_vcf(vcf_1).sort("name").limit(5)
        ```

        ```shell
            shape: (5, 3)
        ┌───────────┬─────────┬──────────────────────────────────────────────────────────────────────────────────────┐
        │ name      ┆ type    ┆ description                                                                          │
        │ ---       ┆ ---     ┆ ---                                                                                  │
        │ str       ┆ str     ┆ str                                                                                  │
        ╞═══════════╪═════════╪══════════════════════════════════════════════════════════════════════════════════════╡
        │ AC        ┆ Integer ┆ Number of non-reference alleles observed (biallelic sites only).                     │
        │ AC_XX     ┆ Integer ┆ Number of non-reference XX alleles observed (biallelic sites only).                  │
        │ AC_XY     ┆ Integer ┆ Number of non-reference XY alleles observed (biallelic sites only).                  │
        │ AC_afr    ┆ Integer ┆ Number of non-reference African-American alleles observed (biallelic sites only).    │
        │ AC_afr_XX ┆ Integer ┆ Number of non-reference African-American XX alleles observed (biallelic sites only). │
        └───────────┴─────────┴──────────────────────────────────────────────────────────────────────────────────────┘
        ```
    """
    # Parse the header via the native reader, then convert to a polars frame.
    schema_table = py_describe_vcf(ctx, path)
    return schema_table.to_polars()

from_polars(name, df)

Register a Polars DataFrame as a DataFusion table.

Parameters:

Name Type Description Default
name str

The name of the table.

required
df Union[DataFrame, LazyFrame]

The Polars DataFrame.

required

Example

import polars as pl
import polars_bio as pb
df = pl.DataFrame({
    "a": [1, 2, 3],
    "b": [4, 5, 6]
})
pb.from_polars("test_df", df)
pb.sql("SELECT * FROM test_df").collect()
3rows [00:00, 2978.91rows/s]
shape: (3, 2)
β”Œβ”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”
β”‚ a   ┆ b   β”‚
β”‚ --- ┆ --- β”‚
β”‚ i64 ┆ i64 β”‚
β•žβ•β•β•β•β•β•ͺ═════║
β”‚ 1   ┆ 4   β”‚
β”‚ 2   ┆ 5   β”‚
β”‚ 3   ┆ 6   β”‚
β””β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”˜
Source code in polars_bio/io.py
def from_polars(name: str, df: Union[pl.DataFrame, pl.LazyFrame]) -> None:
    """
    Register a Polars DataFrame as a DataFusion table.

    Parameters:
        name: The name of the table.
        df: The Polars DataFrame (or LazyFrame, which is collected first).
    !!! Example
        ```python
        import polars as pl
        import polars_bio as pb
        df = pl.DataFrame({
            "a": [1, 2, 3],
            "b": [4, 5, 6]
        })
        pb.from_polars("test_df", df)
        pb.sql("SELECT * FROM test_df").collect()
        ```
        ```shell
        3rows [00:00, 2978.91rows/s]
        shape: (3, 2)
        ┌─────┬─────┐
        │ a   ┆ b   │
        │ --- ┆ --- │
        │ i64 ┆ i64 │
        ╞═════╪═════╡
        │ 1   ┆ 4   │
        │ 2   ┆ 5   │
        │ 3   ┆ 6   │
        └─────┴─────┘
        ```
    """
    # An eager frame converts to Arrow directly; a lazy frame must be
    # materialized first and handed over as an Arrow record-batch reader.
    if isinstance(df, pl.DataFrame):
        arrow_source = df.to_arrow()
    else:
        arrow_source = df.collect().to_arrow().to_reader()
    py_from_polars(ctx, name, arrow_source)

merge(df, overlap_filter=FilterOp.Strict, min_dist=0, cols=['chrom', 'start', 'end'], on_cols=None, output_type='polars.LazyFrame', streaming=False)

Merge overlapping intervals. It is assumed that start < end.

Parameters:

Name Type Description Default
df Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.

required
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for 0-based, Weak for 1-based coordinate systems.

Strict
cols Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
on_cols Union[list[str], None]

List of additional column names for clustering. default is None.

None
output_type str

Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.

'polars.LazyFrame'
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False

Returns:

Type Description
Union[LazyFrame, DataFrame, DataFrame, DataFrame]

polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Example:

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
def merge(
    df: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    overlap_filter: FilterOp = FilterOp.Strict,
    min_dist: float = 0,
    cols: Union[list[str], None] = ["chrom", "start", "end"],
    on_cols: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
    """
    Merge overlapping intervals. It is assumed that start < end.


    Parameters:
        df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED  and Parquet are supported.
        overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for **0-based**, Weak for **1-based** coordinate systems.
        min_dist: Intervals whose gap is at most this distance are also merged. Default is 0.
        cols: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        on_cols: List of additional column names for clustering. default is None.
        output_type: Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Example:

    Todo:
        Support for on_cols.
    """
    # Suffixes are unused by merge itself but required by the shared validator.
    suffixes = ("_1", "_2")
    _validate_overlap_input(cols, cols, on_cols, suffixes, output_type, how="inner")

    my_ctx = get_py_ctx()
    cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols
    contig = cols[0]
    start = cols[1]
    end = cols[2]

    # Clustering always includes the contig column, plus any user-supplied ones.
    on_cols = [] if on_cols is None else on_cols
    on_cols = [contig] + on_cols

    df = read_df_to_datafusion(my_ctx, df)
    df_schema = df.schema()
    # Remember the input start/end Arrow types so the output can be cast back.
    start_type = df_schema.field(start).type
    end_type = df_schema.field(end).type
    # TODO: make sure to avoid conflicting column names
    # Names of the intermediate columns used by the sweep below.
    start_end = "start_end"
    is_start_end = "is_start_or_end"
    current_intervals = "current_intervals"
    n_intervals = "n_intervals"

    # Sweep-line encoding: each interval contributes two events — its start
    # with weight +1 and its end (extended by min_dist) with weight -1.
    end_positions = df.select(
        *(
            [(col(end) + min_dist).alias(start_end), literal(-1).alias(is_start_end)]
            + on_cols
        )
    )
    start_positions = df.select(
        *([col(start).alias(start_end), literal(1).alias(is_start_end)] + on_cols)
    )
    all_positions = start_positions.union(end_positions)
    # Unify the position column's type across both event branches.
    start_end_type = all_positions.schema().field(start_end).type
    all_positions = all_positions.select(
        *([col(start_end).cast(start_end_type), col(is_start_end)] + on_cols)
    )

    # Sort events by position; the secondary key orders ties between start and
    # end events. NOTE(review): the ascending flag depends on overlap_filter —
    # this appears to decide whether merely touching intervals are merged
    # (Weak) or kept separate (Strict); confirm against the FilterOp semantics.
    sorting = [
        col(start_end).sort(),
        col(is_start_end).sort(ascending=(overlap_filter == FilterOp.Strict)),
    ]
    all_positions = all_positions.sort(*sorting)

    on_cols_expr = [col(c) for c in on_cols]

    win = datafusion.expr.Window(
        partition_by=on_cols_expr,
        order_by=sorting,
    )
    # Running sum of +1/-1 events = number of intervals open at each event.
    # row_number gives each event a sequence index within its cluster.
    all_positions = all_positions.select(
        *(
            [
                start_end,
                is_start_end,
                datafusion.functions.sum(col(is_start_end))
                .over(win)
                .alias(current_intervals),
            ]
            + on_cols
            + [
                datafusion.functions.row_number(
                    partition_by=on_cols_expr, order_by=sorting
                ).alias(n_intervals)
            ]
        )
    )
    # Keep only the boundary events of merged runs: an end event that closes
    # the last open interval (sum drops to 0) and a start event that opens the
    # first one (sum rises to 1).
    all_positions = all_positions.filter(
        ((col(current_intervals) == 0) & (col(is_start_end) == -1))
        | ((col(current_intervals) == 1) & (col(is_start_end) == 1))
    )
    # Between a kept start and its matching end there were
    # (row_number difference + 1) events, i.e. that many / 2 source intervals.
    all_positions = all_positions.select(
        *(
            [start_end, is_start_end]
            + on_cols
            + [
                (
                    (
                        col(n_intervals)
                        - datafusion.functions.lag(
                            col(n_intervals), partition_by=on_cols_expr
                        )
                        + 1
                    )
                    / 2
                ).alias(n_intervals)
            ]
        )
    )
    # Pair each end-boundary with the preceding start-boundary (via lag) and
    # undo the min_dist extension applied to end positions above.
    result = all_positions.select(
        *(
            [
                (col(start_end) - min_dist).alias(end),
                is_start_end,
                datafusion.functions.lag(
                    col(start_end), partition_by=on_cols_expr
                ).alias(start),
            ]
            + on_cols
            + [n_intervals]
        )
    )
    # Only the rows anchored at an end event carry a complete merged interval.
    result = result.filter(col(is_start_end) == -1)
    result = result.select(
        *(
            [contig, col(start).cast(start_type), col(end).cast(end_type)]
            + on_cols[1:]
            + [n_intervals]
        )
    )

    return convert_result(result, output_type, streaming)

nearest(df1, df2, overlap_filter=FilterOp.Strict, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], output_type='polars.LazyFrame', streaming=False, read_options=None)

Find pairs of closest genomic intervals. Bioframe inspired API.

Parameters:

Name Type Description Default
df1 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported.

required
df2 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.

required
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for 0-based, Weak for 1-based coordinate systems.

Strict
cols1 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
cols2 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
suffixes tuple[str, str]

Suffixes for the columns of the two overlapped sets.

('_1', '_2')
on_cols Union[list[str], None]

List of additional column names to join on. default is None.

None
output_type str

Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.

'polars.LazyFrame'
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False
read_options Union[ReadOptions, None]

Additional options for reading the input files.

None

Returns:

Type Description
Union[LazyFrame, DataFrame, DataFrame, DataFrame]

polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Note

The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
def nearest(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    overlap_filter: FilterOp = FilterOp.Strict,
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = None,
    cols2: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
    read_options: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
    """
    Find pairs of closest genomic intervals.
    Bioframe inspired API.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.
        overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for **0-based**, Weak for **1-based** coordinate systems.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
            Defaults to `["chrom", "start", "end"]` when `None`.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
            Defaults to `["chrom", "start", "end"]` when `None`.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. default is None.
        output_type: Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
        read_options: Additional options for reading the input files.


    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Note:
        The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.

    Example:

    Todo:
        Support for on_cols.
    """
    # Resolve None defaults first so validation always sees concrete column
    # lists. Using None instead of a mutable list literal as the default
    # avoids sharing one list object across all calls (ruff B006).
    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2

    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how="inner")

    range_options = RangeOptions(
        range_op=RangeOp.Nearest,
        filter_op=overlap_filter,
        suffixes=suffixes,
        columns_1=cols1,
        columns_2=cols2,
        streaming=streaming,
    )
    return range_operation(df1, df2, range_options, output_type, ctx, read_options)

overlap(df1, df2, how='inner', overlap_filter=FilterOp.Strict, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], algorithm='Coitrees', output_type='polars.LazyFrame', streaming=False, read_options1=None, read_options2=None)

Find pairs of overlapping genomic intervals. Bioframe inspired API.

Parameters:

Name Type Description Default
df1 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported.

required
df2 Union[str, DataFrame, LazyFrame, DataFrame]

Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.

required
how str

How to handle the overlaps on the two dataframes. inner: use intersection of the set of intervals from df1 and df2, optional.

'inner'
overlap_filter FilterOp

FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for 0-based, Weak for 1-based coordinate systems.

Strict
cols1 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
cols2 Union[list[str], None]

The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set.

['chrom', 'start', 'end']
suffixes tuple[str, str]

Suffixes for the columns of the two overlapped sets.

('_1', '_2')
on_cols Union[list[str], None]

List of additional column names to join on. default is None.

None
algorithm str

The algorithm to use for the overlap operation.

'Coitrees'
output_type str

Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.

'polars.LazyFrame'
streaming bool

EXPERIMENTAL If True, use Polars streaming engine.

False
read_options1 Union[ReadOptions, None]

Additional options for reading the input files.

None
read_options2 Union[ReadOptions, None]

Additional options for reading the input files.

None

Returns:

Type Description
Union[LazyFrame, DataFrame, DataFrame, DataFrame]

polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

Note
  1. The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.
  2. Streaming is only supported for polars.LazyFrame output.
Example
import polars_bio as pb
import pandas as pd

df1 = pd.DataFrame([
    ['chr1', 1, 5],
    ['chr1', 3, 8],
    ['chr1', 8, 10],
    ['chr1', 12, 14]],
columns=['chrom', 'start', 'end']
)

df2 = pd.DataFrame(
[['chr1', 4, 8],
 ['chr1', 10, 11]],
columns=['chrom', 'start', 'end' ]
)
overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

overlapping_intervals
    chrom_1         start_1     end_1 chrom_2       start_2  end_2
0     chr1            1          5     chr1            4          8
1     chr1            3          8     chr1            4          8
Todo

Support for on_cols.

Source code in polars_bio/range_op.py
def overlap(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, pd.DataFrame],
    how: str = "inner",
    overlap_filter: FilterOp = FilterOp.Strict,
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = None,
    cols2: Union[list[str], None] = None,
    algorithm: str = "Coitrees",
    output_type: str = "polars.LazyFrame",
    streaming: bool = False,
    read_options1: Union[ReadOptions, None] = None,
    read_options2: Union[ReadOptions, None] = None,
) -> Union[pl.LazyFrame, pl.DataFrame, pd.DataFrame, datafusion.DataFrame]:
    """
    Find pairs of overlapping genomic intervals.
    Bioframe inspired API.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED and Parquet are supported.
        how: How to handle the overlaps on the two dataframes. inner: use intersection of the set of intervals from df1 and df2, optional.
        overlap_filter: FilterOp, optional. The type of overlap to consider (Weak or Strict). Strict for **0-based**, Weak for **1-based** coordinate systems.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
            Defaults to `["chrom", "start", "end"]` when `None`.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
            Defaults to `["chrom", "start", "end"]` when `None`.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. default is None.
        algorithm: The algorithm to use for the overlap operation.
        output_type: Type of the output. The default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        streaming: **EXPERIMENTAL** If True, use Polars [streaming](features.md#streaming) engine.
        read_options1: Additional options for reading the input files.
        read_options2: Additional options for reading the input files.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Note:
        1. The default output format, i.e.  [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.
        2. Streaming is only supported for polars.LazyFrame output.

    Example:
        ```python
        import polars_bio as pb
        import pandas as pd

        df1 = pd.DataFrame([
            ['chr1', 1, 5],
            ['chr1', 3, 8],
            ['chr1', 8, 10],
            ['chr1', 12, 14]],
        columns=['chrom', 'start', 'end']
        )

        df2 = pd.DataFrame(
        [['chr1', 4, 8],
         ['chr1', 10, 11]],
        columns=['chrom', 'start', 'end' ]
        )
        overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

        overlapping_intervals
            chrom_1         start_1     end_1 chrom_2       start_2  end_2
        0     chr1            1          5     chr1            4          8
        1     chr1            3          8     chr1            4          8

        ```

    Todo:
         Support for on_cols.
    """
    # Resolve None defaults first so validation always sees concrete column
    # lists. Using None instead of a mutable list literal as the default
    # avoids sharing one list object across all calls (ruff B006).
    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2

    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type, how)

    range_options = RangeOptions(
        range_op=RangeOp.Overlap,
        filter_op=overlap_filter,
        suffixes=suffixes,
        columns_1=cols1,
        columns_2=cols2,
        overlap_alg=algorithm,
        streaming=streaming,
    )
    return range_operation(
        df1, df2, range_options, output_type, ctx, read_options1, read_options2
    )

read_bam(path)

Read a BAM file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the BAM file.

required
Source code in polars_bio/io.py
def read_bam(path: str) -> pl.LazyFrame:
    """
    Read a BAM file into a LazyFrame.

    Parameters:
        path: The path to the BAM file.
    """
    # Read eagerly via the native reader, then wrap it in a lazy scan.
    return lazy_scan(read_file(path, InputFormat.Bam, None))

read_fasta(path)

Read a FASTA file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the FASTA file.

required
Source code in polars_bio/io.py
def read_fasta(path: str) -> pl.LazyFrame:
    """
    Read a FASTA file into a LazyFrame.

    Parameters:
        path: The path to the FASTA file.
    """
    # Read eagerly via the native reader, then wrap it in a lazy scan.
    return lazy_scan(read_file(path, InputFormat.Fasta, None))

read_fastq(path)

Read a FASTQ file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the FASTQ file.

required
Source code in polars_bio/io.py
def read_fastq(path: str) -> pl.LazyFrame:
    """
    Read a FASTQ file into a LazyFrame.

    Parameters:
        path: The path to the FASTQ file.
    """
    # Read eagerly via the native reader, then wrap it in a lazy scan.
    return lazy_scan(read_file(path, InputFormat.Fastq, None))

read_table(path, schema=None, **kwargs)

Read a tab-delimited (i.e. BED) file into a Polars LazyFrame. Tries to be compatible with Bioframe's read_table but faster and lazy. Schema should follow the Bioframe's schema format.

Parameters:

Name Type Description Default
path str

The path to the file.

required
schema Dict

Schema should follow the Bioframe's schema format.

None
Source code in polars_bio/io.py
def read_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
    """
    Read a tab-delimited (i.e. BED) file into a Polars LazyFrame.
    Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
    but faster and lazy. Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

    Parameters:
        path: The path to the file.
        schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

    Raises:
        ValueError: If the named schema's column count does not match the file.
    """
    df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
    if schema is not None:
        columns = SCHEMAS[schema]
        # Resolve the lazy schema once; collect_schema() can be expensive.
        num_input_columns = len(df.collect_schema())
        if len(columns) != num_input_columns:
            raise ValueError(
                f"Schema incompatible with the input. Expected {len(columns)} columns in a schema, got {num_input_columns} in the input data file. Please provide a valid schema."
            )
        # Single rename with a full mapping instead of one lazy rename node
        # per column (scan_csv auto-names columns column_1, column_2, ...).
        df = df.rename({f"column_{i + 1}": c for i, c in enumerate(columns)})
    return df

read_vcf(path, info_fields=None, thread_num=1, chunk_size=8, concurrent_fetches=1, streaming=False)

Read a VCF file into a LazyFrame.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required
info_fields Union[list[str], None]

The fields to read from the INFO column.

None
thread_num int

The number of threads to use for reading the VCF file. Used only for parallel decompression of BGZF blocks. Works only for local files.

1
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
streaming bool

Whether to read the VCF file in streaming mode.

False

Note

VCF reader uses 1-based coordinate system for the start and end columns.

Source code in polars_bio/io.py
def read_vcf(
    path: str,
    info_fields: Union[list[str], None] = None,
    thread_num: int = 1,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    streaming: bool = False,
) -> Union[pl.LazyFrame, pl.DataFrame]:
    """
    Read a VCF file into a LazyFrame.

    Parameters:
        path: The path to the VCF file.
        info_fields: The fields to read from the INFO column.
        thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        streaming: Whether to read the VCF file in streaming mode.

    !!! note
        VCF reader uses **1-based** coordinate system for the `start` and `end` columns.
    """
    # Bundle the VCF-specific knobs into the generic ReadOptions container
    # expected by the native reader.
    options = ReadOptions(
        vcf_read_options=VcfReadOptions(
            info_fields=_cleanse_infos(info_fields),
            thread_num=thread_num,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
        )
    )
    if streaming:
        return read_file(path, InputFormat.Vcf, options, streaming)
    # Non-streaming path: materialize the reader handle and expose it lazily.
    return lazy_scan(read_file(path, InputFormat.Vcf, options))

register_vcf(path, name=None, info_fields=None, thread_num=1, chunk_size=64, concurrent_fetches=8)

Register a VCF file as a Datafusion table.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required
name Union[str, None]

The name of the table. If None, the name of the table will be generated automatically based on the path.

None
info_fields Union[list[str], None]

The fields to read from the INFO column.

None
thread_num int

The number of threads to use for reading the VCF file. Used only for parallel decompression of BGZF blocks. Works only for local files.

1
chunk_size int

The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16.

64
concurrent_fetches int

The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2.

8

Note

VCF reader uses 1-based coordinate system for the start and end columns.

Example

import polars_bio as pb
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz")
INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the VCF file. As a rule of thumb for large scale operations (reading a whole VCF), it is recommended to use the default values.

Source code in polars_bio/io.py
def register_vcf(
    path: str,
    name: Union[str, None] = None,
    info_fields: Union[list[str], None] = None,
    thread_num: int = 1,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
) -> None:
    """
    Register a VCF file as a Datafusion table.

    Parameters:
        path: The path to the VCF file.
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        info_fields: The fields to read from the INFO column.
        thread_num: The number of threads to use for reading the VCF file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.

    !!! note
        VCF reader uses **1-based** coordinate system for the `start` and `end` columns.

    !!! Example
          ```python
          import polars_bio as pb
          pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz")
          ```
         ```shell
         INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz
         ```
    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the VCF file. As a rule of thumb for large scale operations (reading a whole VCF), it is recommended to use the default values.
    """
    # Same option plumbing as read_vcf, but the table is registered in the
    # shared DataFusion session context instead of being scanned immediately.
    vcf_read_options = VcfReadOptions(
        info_fields=_cleanse_infos(info_fields),
        thread_num=thread_num,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
    )
    read_options = ReadOptions(vcf_read_options=vcf_read_options)
    py_register_table(ctx, path, name, InputFormat.Vcf, read_options)

register_view(name, query)

Register a query as a Datafusion view. This view can be used in genomic ranges operations, such as overlap, nearest, and count_overlaps. It is useful for filtering, transforming, and aggregating data prior to the range operation. When combined with the range operation, it can be used to perform complex transformations in a streaming fashion end-to-end.

Parameters:

Name Type Description Default
name str

The name of the table.

required
query str

The SQL query.

required

Example

import polars_bio as pb
pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz", "gnomad_sv")
pb.register_view("v_gnomad_sv", "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv")
pb.sql("SELECT * FROM v_gnomad_sv").limit(5).collect()
  shape: (5, 3)
  β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
  β”‚ chrom ┆ start   ┆ end     β”‚
  β”‚ ---   ┆ ---     ┆ ---     β”‚
  β”‚ str   ┆ u32     ┆ u32     β”‚
  β•žβ•β•β•β•β•β•β•β•ͺ═════════β•ͺ═════════║
  β”‚ 21    ┆ 5031905 ┆ 5031905 β”‚
  β”‚ 21    ┆ 5031905 ┆ 5031905 β”‚
  β”‚ 21    ┆ 5031909 ┆ 5031909 β”‚
  β”‚ 21    ┆ 5031911 ┆ 5031911 β”‚
  β”‚ 21    ┆ 5031911 ┆ 5031911 β”‚
  β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
Source code in polars_bio/io.py
def register_view(name: str, query: str) -> None:
    """
    Register a query as a Datafusion view. This view can be used in genomic ranges operations,
    such as overlap, nearest, and count_overlaps. It is useful for filtering, transforming, and aggregating data
    prior to the range operation. When combined with the range operation, it can be used to perform complex
    transformations in a streaming fashion end-to-end.

    Parameters:
        name: The name of the table.
        query: The SQL query.

    !!! Example
          ```python
          import polars_bio as pb
          pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz", "gnomad_sv")
          pb.register_view("v_gnomad_sv", "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv")
          pb.sql("SELECT * FROM v_gnomad_sv").limit(5).collect()
          ```
          ```shell
            shape: (5, 3)
            β”Œβ”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”¬β”€β”€β”€β”€β”€β”€β”€β”€β”€β”
            β”‚ chrom ┆ start   ┆ end     β”‚
            β”‚ ---   ┆ ---     ┆ ---     β”‚
            β”‚ str   ┆ u32     ┆ u32     β”‚
            β•žβ•β•β•β•β•β•β•β•ͺ═════════β•ͺ═════════║
            β”‚ 21    ┆ 5031905 ┆ 5031905 β”‚
            β”‚ 21    ┆ 5031905 ┆ 5031905 β”‚
            β”‚ 21    ┆ 5031909 ┆ 5031909 β”‚
            β”‚ 21    ┆ 5031911 ┆ 5031911 β”‚
            β”‚ 21    ┆ 5031911 ┆ 5031911 β”‚
            β””β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”΄β”€β”€β”€β”€β”€β”€β”€β”€β”€β”˜
          ```
    """
    # Delegates to the native layer; the view lives in the shared session context.
    py_register_view(ctx, name, query)

sql(query, streaming=False)

Execute a SQL query on the registered tables.

Parameters:

Name Type Description Default
query str

The SQL query.

required
streaming bool

Whether to execute the query in streaming mode.

False

Example

import polars_bio as pb
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_v4_1_sv")
pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
Source code in polars_bio/io.py
def sql(query: str, streaming: bool = False) -> pl.LazyFrame:
    """
    Execute a SQL query on the registered tables.

    Parameters:
        query: The SQL query.
        streaming: Whether to execute the query in streaming mode.

    !!! Example
          ```python
          import polars_bio as pb
          pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_v4_1_sv")
          pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
          ```
    """
    if not streaming:
        # Eager plan: build a DataFusion reader, then expose it as a lazy scan.
        reader = py_read_sql(ctx, query)
        return lazy_scan(reader)
    # Streaming plan: wrap the native scan in the streaming adapter.
    return stream_wrapper(py_scan_sql(ctx, query))

visualize_intervals(df, label='overlapping pair')

Visualize the overlapping intervals.

Parameters:

Name Type Description Default
df Union[DataFrame, DataFrame]

Pandas DataFrame or Polars DataFrame. The DataFrame containing the overlapping intervals

required
label str

Title prefix for each plot; rendered as "<label> #<row index>".

'overlapping pair'
Source code in polars_bio/range_viz.py
def visualize_intervals(
    df: Union[pd.DataFrame, pl.DataFrame], label: str = "overlapping pair"
) -> None:
    """
    Visualize the overlapping intervals.

    Parameters:
        df: Pandas DataFrame or Polars DataFrame. The DataFrame containing the overlapping intervals
        label: Title prefix for each plot; rendered as "<label> #<row index>".

    """
    assert isinstance(
        df, (pd.DataFrame, pl.DataFrame)
    ), "df must be a Pandas or Polars DataFrame"
    # bioframe's plotting helper works on pandas rows, so convert Polars input once.
    df = df if isinstance(df, pd.DataFrame) else df.to_pandas()
    # One plot per row; each row is expected to carry a pair of intervals
    # (start_1/end_1 and start_2/end_2), as produced by the overlap operation.
    for i, reg_pair in df.iterrows():
        bf.vis.plot_intervals_arr(
            starts=[reg_pair.start_1, reg_pair.start_2],
            ends=[reg_pair.end_1, reg_pair.end_2],
            colors=["skyblue", "lightpink"],  # first interval blue, second pink
            levels=[2, 1],  # draw the two intervals on separate tracks
            xlim=(0, 16),  # NOTE(review): hard-coded x-range; intervals beyond 16 fall outside the view — confirm intended
            show_coords=True,
        )
        plt.title(f"{label} #{i}")