
⚙️ API reference

The polars-bio API is grouped into the following categories:

  • File I/O: Reading files in various biological formats from local and cloud storage.
  • Data Processing: Exposes a rich SQL programming interface powered by Apache DataFusion for operations such as sorting, filtering, and other transformations on input bioinformatic datasets registered as tables. You can easily query and process file formats such as VCF, GFF, BAM, and FASTQ using SQL syntax (see the sketch after this list).
  • Interval Operations: Functions for performing common interval operations such as overlap, nearest, and coverage.
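
For example, a registered file can be queried directly with SQL. A minimal sketch, assuming the register_vcf and sql helpers described above (the exact helper signatures, table name, and file path are assumptions and may differ in your version):

import polars_bio as pb

# Register a VCF file as a table, then query it with SQL (hypothetical path and table name).
pb.register_vcf("sample.vcf.gz", "variants")
pb.sql("SELECT chrom, start, ref, alt FROM variants WHERE chrom = 'chr1'").limit(5).collect()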

There are two ways of using the polars-bio API:

  • using the polars_bio module

Example

import polars_bio as pb
pb.read_fastq("gs://genomics-public-data/platinum-genomes/fastq/ERR194146.fastq.gz").limit(1).collect()
  • directly on a Polars LazyFrame under the registered pb namespace

Example

import polars_bio as pb

# df is a Polars LazyFrame:
# >>> type(df)
# <class 'polars.lazyframe.frame.LazyFrame'>
df.pb.sort().limit(5).collect()

Tip

  1. Not all functions are available in both ways.
  2. You can of course use both ways in the same script, as shown in the sketch below.
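
A minimal sketch combining both styles (the file path is hypothetical; scan_vcf and the pb namespace are documented elsewhere in this reference):

import polars_bio as pb

# Module-level I/O returns a Polars LazyFrame...
lf = pb.scan_vcf("sample.vcf.gz")
# ...which can then be processed via the registered pb namespace.
lf.pb.sort().limit(5).collect()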

CoordinateSystemMismatchError

Bases: Exception

Raised when two DataFrames have different coordinate systems.

This error occurs when attempting range operations (overlap, nearest, etc.) on DataFrames where one uses 0-based coordinates and the other uses 1-based coordinates.

Example
df1 = pb.scan_vcf("file1.vcf", one_based=False)  # 0-based
df2 = pb.scan_vcf("file2.vcf", one_based=True)   # 1-based
pb.overlap(df1, df2)  # Raises CoordinateSystemMismatchError
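
One way to avoid the error is to read both inputs with the same coordinate system. A minimal sketch using the use_zero_based parameter from the scan_vcf signature shown later in this reference (file paths are hypothetical):

df1 = pb.scan_vcf("file1.vcf", use_zero_based=True)  # 0-based
df2 = pb.scan_vcf("file2.vcf", use_zero_based=True)  # 0-based
pb.overlap(df1, df2)  # coordinate systems match, so no error is raised
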
Source code in polars_bio/exceptions.py
class CoordinateSystemMismatchError(Exception):
    """Raised when two DataFrames have different coordinate systems.

    This error occurs when attempting range operations (overlap, nearest, etc.)
    on DataFrames where one uses 0-based coordinates and the other uses 1-based
    coordinates.

    Example:
        ```python
        df1 = pb.scan_vcf("file1.vcf", one_based=False)  # 0-based
        df2 = pb.scan_vcf("file2.vcf", one_based=True)   # 1-based
        pb.overlap(df1, df2)  # Raises CoordinateSystemMismatchError
        ```
    """

    pass

MissingCoordinateSystemError

Bases: Exception

Raised when a DataFrame lacks coordinate system metadata.

Range operations require coordinate system metadata to determine the correct interval semantics. This error is raised when:

  • A Polars LazyFrame/DataFrame lacks polars-config-meta metadata
  • A Pandas DataFrame lacks df.attrs["coordinate_system_zero_based"]
  • A file path registers a table without Arrow schema metadata

For Polars DataFrames, use polars-bio I/O functions (scan_, read_) which automatically set the metadata.

For Pandas DataFrames, set the attribute before passing to range operations:

df.attrs["coordinate_system_zero_based"] = True  # 0-based coords

Example
import pandas as pd
import polars_bio as pb

pdf = pd.read_csv("intervals.bed", sep="\t", names=["chrom", "start", "end"])
pb.overlap(pdf, pdf)  # Raises MissingCoordinateSystemError

# Fix: set the coordinate system metadata
pdf.attrs["coordinate_system_zero_based"] = True
pb.overlap(pdf, pdf)  # Works correctly
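
For Polars inputs, the scan_/read_ functions set this metadata automatically, so no extra step is needed. A minimal sketch (hypothetical file path):

import polars_bio as pb

lf = pb.scan_vcf("sample.vcf.gz")  # coordinate-system metadata is set by the reader
pb.overlap(lf, lf)                 # works without additional configuration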
Source code in polars_bio/exceptions.py
class MissingCoordinateSystemError(Exception):
    """Raised when a DataFrame lacks coordinate system metadata.

    Range operations require coordinate system metadata to determine the
    correct interval semantics. This error is raised when:

    - A Polars LazyFrame/DataFrame lacks polars-config-meta metadata
    - A Pandas DataFrame lacks df.attrs["coordinate_system_zero_based"]
    - A file path registers a table without Arrow schema metadata

    For Polars DataFrames, use polars-bio I/O functions (scan_*, read_*) which
    automatically set the metadata.

    For Pandas DataFrames, set the attribute before passing to range operations:
        ```python
        df.attrs["coordinate_system_zero_based"] = True  # 0-based coords
        ```

    Example:
        ```python
        import pandas as pd
        import polars_bio as pb

        pdf = pd.read_csv("intervals.bed", sep="\t", names=["chrom", "start", "end"])
        pb.overlap(pdf, pdf)  # Raises MissingCoordinateSystemError

        # Fix: set the coordinate system metadata
        pdf.attrs["coordinate_system_zero_based"] = True
        pb.overlap(pdf, pdf)  # Works correctly
        ```
    """

    pass

data_input

Source code in polars_bio/io.py
class IOOperations:
    @staticmethod
    def read_fasta(
        path: str,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
    ) -> pl.DataFrame:
        """

        Read a FASTA file into a DataFrame.

        Parameters:
            path: The path to the FASTA file.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
            projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.

        !!! Example
            ```shell
            wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
            ```

            ```python
            import polars_bio as pb
            pb.read_fasta("/tmp/test.fasta").limit(1)
            ```
            ```shell
             shape: (1, 3)
            ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
            │ name                    ┆ description                     ┆ sequence                        │
            │ ---                     ┆ ---                             ┆ ---                             │
            │ str                     ┆ str                             ┆ str                             │
            ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
            │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
            └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
            ```
        """
        return IOOperations.scan_fasta(
            path,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            compression_type,
            projection_pushdown,
        ).collect()

    @staticmethod
    def scan_fasta(
        path: str,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
    ) -> pl.LazyFrame:
        """

        Lazily read a FASTA file into a LazyFrame.

        Parameters:
            path: The path to the FASTA file.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

        !!! Example
            ```shell
            wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
            ```

            ```python
            import polars_bio as pb
            pb.scan_fasta("/tmp/test.fasta").limit(1).collect()
            ```
            ```shell
             shape: (1, 3)
            ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
            │ name                    ┆ description                     ┆ sequence                        │
            │ ---                     ┆ ---                             ┆ ---                             │
            │ str                     ┆ str                             ┆ str                             │
            ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
            │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
            └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
            ```
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )
        fasta_read_options = FastaReadOptions(
            object_storage_options=object_storage_options
        )
        read_options = ReadOptions(fasta_read_options=fasta_read_options)
        return _read_file(path, InputFormat.Fasta, read_options, projection_pushdown)

    @staticmethod
    def read_vcf(
        path: str,
        info_fields: Union[list[str], None] = None,
        format_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Read a VCF file into a DataFrame.

        !!! hint "Parallelism & Indexed Reads"
            Indexed parallel reads and predicate pushdown are automatic when a TBI/CSI index
            is present. See [File formats support](/polars-bio/features/#file-formats-support),
            [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

        Parameters:
            path: The path to the VCF file.
            info_fields: List of INFO field names to include. If *None*, all INFO fields will be detected automatically from the VCF header. Use this to limit fields for better performance.
            format_fields: List of FORMAT field names to include (per-sample genotype data). If *None*, all FORMAT fields will be automatically detected from the VCF header. Column naming depends on the number of samples: for **single-sample** VCFs, columns are named directly by the FORMAT field (e.g., `GT`, `DP`); for **multi-sample** VCFs, columns are named `{sample_name}_{format_field}` (e.g., `NA12878_GT`, `NA12879_DP`). The GT field is always converted to string with `/` (unphased) or `|` (phased) separator.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.vcf.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

        !!! Example "Reading VCF with INFO and FORMAT fields"
            ```python
            import polars_bio as pb

            # Read VCF with both INFO and FORMAT fields
            df = pb.read_vcf(
                "sample.vcf.gz",
                info_fields=["END"],              # INFO field
                format_fields=["GT", "DP", "GQ"]  # FORMAT fields
            )

            # Single-sample VCF: FORMAT columns named directly (GT, DP, GQ)
            print(df.select(["chrom", "start", "ref", "alt", "END", "GT", "DP", "GQ"]))
            # Output:
            # shape: (10, 8)
            # ┌───────┬───────┬─────┬─────┬──────┬─────┬─────┬─────┐
            # │ chrom ┆ start ┆ ref ┆ alt ┆ END  ┆ GT  ┆ DP  ┆ GQ  │
            # │ str   ┆ u32   ┆ str ┆ str ┆ i32  ┆ str ┆ i32 ┆ i32 │
            # ╞═══════╪═══════╪═════╪═════╪══════╪═════╪═════╪═════╡
            # │ 1     ┆ 10009 ┆ A   ┆ .   ┆ null ┆ 0/0 ┆ 10  ┆ 27  │
            # │ 1     ┆ 10015 ┆ A   ┆ .   ┆ null ┆ 0/0 ┆ 17  ┆ 35  │
            # └───────┴───────┴─────┴─────┴──────┴─────┴─────┴─────┘

            # Multi-sample VCF: FORMAT columns named {sample}_{field}
            df = pb.read_vcf("multisample.vcf", format_fields=["GT", "DP"])
            print(df.select(["chrom", "start", "NA12878_GT", "NA12878_DP", "NA12879_GT"]))
            ```
        """
        lf = IOOperations.scan_vcf(
            path,
            info_fields,
            format_fields,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            compression_type,
            projection_pushdown,
            predicate_pushdown,
            use_zero_based,
        )
        # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
        zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
        df = lf.collect()
        # Set metadata on the collected DataFrame
        if zero_based is not None:
            set_coordinate_system(df, zero_based)
        return df

    @staticmethod
    def scan_vcf(
        path: str,
        info_fields: Union[list[str], None] = None,
        format_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.LazyFrame:
        """
        Lazily read a VCF file into a LazyFrame.

        !!! hint "Parallelism & Indexed Reads"
            Indexed parallel reads and predicate pushdown are automatic when a TBI/CSI index
            is present. See [File formats support](/polars-bio/features/#file-formats-support),
            [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

        Parameters:
            path: The path to the VCF file.
            info_fields: List of INFO field names to include. If *None*, all INFO fields will be detected automatically from the VCF header. Use this to limit fields for better performance.
            format_fields: List of FORMAT field names to include (per-sample genotype data). If *None*, all FORMAT fields will be automatically detected from the VCF header. Column naming depends on the number of samples: for **single-sample** VCFs, columns are named directly by the FORMAT field (e.g., `GT`, `DP`); for **multi-sample** VCFs, columns are named `{sample_name}_{format_field}` (e.g., `NA12878_GT`, `NA12879_DP`). The GT field is always converted to string with `/` (unphased) or `|` (phased) separator.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.vcf.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

        !!! Example "Lazy scanning VCF with INFO and FORMAT fields"
            ```python
            import polars_bio as pb

            # Lazily scan VCF with both INFO and FORMAT fields
            lf = pb.scan_vcf(
                "sample.vcf.gz",
                info_fields=["END"],              # INFO field
                format_fields=["GT", "DP", "GQ"]  # FORMAT fields
            )

            # Apply filters and collect only what's needed
            df = lf.filter(pl.col("DP") > 20).select(
                ["chrom", "start", "ref", "alt", "GT", "DP", "GQ"]
            ).collect()

            # Single-sample VCF: FORMAT columns named directly (GT, DP, GQ)
            # Multi-sample VCF: FORMAT columns named {sample}_{field}
            ```
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        # Use provided info_fields or autodetect from VCF header
        if info_fields is not None:
            initial_info_fields = info_fields
        else:
            # Get all info fields from VCF header for proper projection pushdown
            all_info_fields = None
            try:
                vcf_schema_df = IOOperations.describe_vcf(
                    path,
                    allow_anonymous=allow_anonymous,
                    enable_request_payer=enable_request_payer,
                    compression_type=compression_type,
                )
                # Use column name 'name' not 'id' based on the schema output
                all_info_fields = vcf_schema_df.select("name").to_series().to_list()
            except Exception:
                # Fallback to None if unable to get info fields
                all_info_fields = None

            # Always start with all info fields to establish full schema
            # The callback will re-register with only requested info fields for optimization
            initial_info_fields = all_info_fields

        zero_based = _resolve_zero_based(use_zero_based)
        vcf_read_options = VcfReadOptions(
            info_fields=initial_info_fields,
            format_fields=format_fields,
            object_storage_options=object_storage_options,
            zero_based=zero_based,
        )
        read_options = ReadOptions(vcf_read_options=vcf_read_options)
        return _read_file(
            path,
            InputFormat.Vcf,
            read_options,
            projection_pushdown,
            predicate_pushdown,
            zero_based=zero_based,
        )

    @staticmethod
    def read_gff(
        path: str,
        attr_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Read a GFF file into a DataFrame.

        Parameters:
            path: The path to the GFF file.
            attr_fields: List of attribute field names to extract as separate columns. If *None*, attributes will be kept as a nested structure. Use this to extract specific attributes like 'ID', 'gene_name', 'gene_type', etc. as direct columns for easier access.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.gff.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
        """
        lf = IOOperations.scan_gff(
            path,
            attr_fields,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            compression_type,
            projection_pushdown,
            predicate_pushdown,
            use_zero_based,
        )
        # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
        zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
        df = lf.collect()
        # Set metadata on the collected DataFrame
        if zero_based is not None:
            set_coordinate_system(df, zero_based)
        return df

    @staticmethod
    def scan_gff(
        path: str,
        attr_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.LazyFrame:
        """
        Lazily read a GFF file into a LazyFrame.

        Parameters:
            path: The path to the GFF file.
            attr_fields: List of attribute field names to extract as separate columns. If *None*, attributes will be kept as a nested structure. Use this to extract specific attributes like 'ID', 'gene_name', 'gene_type', etc. as direct columns for easier access.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.gff.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        zero_based = _resolve_zero_based(use_zero_based)
        gff_read_options = GffReadOptions(
            attr_fields=attr_fields,
            object_storage_options=object_storage_options,
            zero_based=zero_based,
        )
        read_options = ReadOptions(gff_read_options=gff_read_options)
        return _read_file(
            path,
            InputFormat.Gff,
            read_options,
            projection_pushdown,
            predicate_pushdown,
            zero_based=zero_based,
        )

    @staticmethod
    def read_bam(
        path: str,
        tag_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Read a BAM file into a DataFrame.

        !!! hint "Parallelism & Indexed Reads"
            Indexed parallel reads and predicate pushdown are automatic when a BAI/CSI index
            is present. See [File formats support](/polars-bio/features/#file-formats-support),
            [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

        Parameters:
            path: The path to the BAM file.
            tag_fields: List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large-scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            predicate_pushdown: Enable predicate pushdown using index files (BAI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.bam.bai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
        """
        lf = IOOperations.scan_bam(
            path,
            tag_fields,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            projection_pushdown,
            predicate_pushdown,
            use_zero_based,
        )
        # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
        zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
        df = lf.collect()
        # Set metadata on the collected DataFrame
        if zero_based is not None:
            set_coordinate_system(df, zero_based)
        return df

    @staticmethod
    def scan_bam(
        path: str,
        tag_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.LazyFrame:
        """
        Lazily read a BAM file into a LazyFrame.

        !!! hint "Parallelism & Indexed Reads"
            Indexed parallel reads and predicate pushdown are automatic when a BAI/CSI index
            is present. See [File formats support](/polars-bio/features/#file-formats-support),
            [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

        Parameters:
            path: The path to the BAM file.
            tag_fields: List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            predicate_pushdown: Enable predicate pushdown using index files (BAI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.bam.bai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type="auto",
        )

        zero_based = _resolve_zero_based(use_zero_based)
        bam_read_options = BamReadOptions(
            object_storage_options=object_storage_options,
            zero_based=zero_based,
            tag_fields=tag_fields,
        )
        read_options = ReadOptions(bam_read_options=bam_read_options)
        return _read_file(
            path,
            InputFormat.Bam,
            read_options,
            projection_pushdown,
            predicate_pushdown,
            zero_based=zero_based,
        )

    @staticmethod
    def read_cram(
        path: str,
        reference_path: str = None,
        tag_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Read a CRAM file into a DataFrame.

        !!! hint "Parallelism & Indexed Reads"
            Indexed parallel reads and predicate pushdown are automatic when a CRAI index
            is present. See [File formats support](/polars-bio/features/#file-formats-support),
            [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

        Parameters:
            path: The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).
            reference_path: Optional path to external FASTA reference file (**local path only**, cloud storage not supported). If not provided, the CRAM file must contain embedded reference sequences. The FASTA file must have an accompanying index file (.fai) in the same directory. Create the index using: `samtools faidx reference.fasta`
            tag_fields: List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.
            predicate_pushdown: Enable predicate pushdown using index files (CRAI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.cram.crai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

        !!! warning "Known Limitation: MD and NM Tags"
            Due to a limitation in the underlying noodles-cram library, **MD (mismatch descriptor) and NM (edit distance) tags are not accessible** from CRAM files, even when stored in the file. These tags can be seen with samtools but are not exposed through the noodles-cram record.data() interface.

            Other optional tags (RG, MQ, AM, OQ, etc.) work correctly. This issue is tracked at: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

            **Workaround**: Use BAM format if MD/NM tags are required for your analysis.

        !!! example "Using External Reference"
            ```python
            import polars_bio as pb

            # Read CRAM with external reference
            df = pb.read_cram(
                "/path/to/file.cram",
                reference_path="/path/to/reference.fasta"
            )
            ```

        !!! example "Public CRAM File Example"
            Download and read a public CRAM file from 42basepairs:
            ```bash
            # Download the CRAM file and reference
            wget https://42basepairs.com/download/s3/gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram
            wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta

            # Create FASTA index (required)
            samtools faidx Homo_sapiens_assembly38.fasta
            ```

            ```python
            import polars_bio as pb

            # Read first 5 reads from the CRAM file
            df = pb.scan_cram(
                "NA12878.cram",
                reference_path="Homo_sapiens_assembly38.fasta"
            ).limit(5).collect()

            print(df.select(["name", "chrom", "start", "end", "cigar"]))
            ```

        !!! example "Creating CRAM with Embedded Reference"
            To create a CRAM file with embedded reference using samtools:
            ```bash
            samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam
            ```

        Returns:
            A Polars DataFrame with the following schema:
                - name: Read name (String)
                - chrom: Chromosome/contig name (String)
                - start: Alignment start position, 1-based (UInt32)
                - end: Alignment end position, 1-based (UInt32)
                - flags: SAM flags (UInt32)
                - cigar: CIGAR string (String)
                - mapping_quality: Mapping quality (UInt32)
                - mate_chrom: Mate chromosome/contig name (String)
                - mate_start: Mate alignment start position, 1-based (UInt32)
                - sequence: Read sequence (String)
                - quality_scores: Base quality scores (String)
        """
        lf = IOOperations.scan_cram(
            path,
            reference_path,
            tag_fields,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            projection_pushdown,
            predicate_pushdown,
            use_zero_based,
        )
        # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
        zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
        df = lf.collect()
        # Set metadata on the collected DataFrame
        if zero_based is not None:
            set_coordinate_system(df, zero_based)
        return df

    @staticmethod
    def scan_cram(
        path: str,
        reference_path: str = None,
        tag_fields: Union[list[str], None] = None,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        projection_pushdown: bool = True,
        predicate_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.LazyFrame:
        """
        Lazily read a CRAM file into a LazyFrame.

        !!! hint "Parallelism & Indexed Reads"
            Indexed parallel reads and predicate pushdown are automatic when a CRAI index
            is present. See [File formats support](/polars-bio/features/#file-formats-support),
            [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

        Parameters:
            path: The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).
            reference_path: Optional path to external FASTA reference file (**local path only**, cloud storage not supported). If not provided, the CRAM file must contain embedded reference sequences. The FASTA file must have an accompanying index file (.fai) in the same directory. Create the index using: `samtools faidx reference.fasta`
            tag_fields: List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.
            predicate_pushdown: Enable predicate pushdown using index files (CRAI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.cram.crai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

        !!! warning "Known Limitation: MD and NM Tags"
            Due to a limitation in the underlying noodles-cram library, **MD (mismatch descriptor) and NM (edit distance) tags are not accessible** from CRAM files, even when stored in the file. These tags can be seen with samtools but are not exposed through the noodles-cram record.data() interface.

            Other optional tags (RG, MQ, AM, OQ, etc.) work correctly. This issue is tracked at: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

            **Workaround**: Use BAM format if MD/NM tags are required for your analysis.

        !!! example "Using External Reference"
            ```python
            import polars_bio as pb

            # Lazy scan CRAM with external reference
            lf = pb.scan_cram(
                "/path/to/file.cram",
                reference_path="/path/to/reference.fasta"
            )

            # Apply transformations and collect
            df = lf.filter(pl.col("chrom") == "chr1").collect()
            ```

        !!! example "Public CRAM File Example"
            Download and read a public CRAM file from 42basepairs:
            ```bash
            # Download the CRAM file and reference
            wget https://42basepairs.com/download/s3/gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram
            wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta

            # Create FASTA index (required)
            samtools faidx Homo_sapiens_assembly38.fasta
            ```

            ```python
            import polars_bio as pb
            import polars as pl

            # Lazy scan and filter for chromosome 20 reads
            df = pb.scan_cram(
                "NA12878.cram",
                reference_path="Homo_sapiens_assembly38.fasta"
            ).filter(
                pl.col("chrom") == "chr20"
            ).select(
                ["name", "chrom", "start", "end", "mapping_quality"]
            ).limit(10).collect()

            print(df)
            ```

        !!! example "Creating CRAM with Embedded Reference"
            To create a CRAM file with embedded reference using samtools:
            ```bash
            samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam
            ```

        Returns:
            A Polars LazyFrame with the following schema:
                - name: Read name (String)
                - chrom: Chromosome/contig name (String)
                - start: Alignment start position, 1-based (UInt32)
                - end: Alignment end position, 1-based (UInt32)
                - flags: SAM flags (UInt32)
                - cigar: CIGAR string (String)
                - mapping_quality: Mapping quality (UInt32)
                - mate_chrom: Mate chromosome/contig name (String)
                - mate_start: Mate alignment start position, 1-based (UInt32)
                - sequence: Read sequence (String)
                - quality_scores: Base quality scores (String)
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type="auto",
        )

        zero_based = _resolve_zero_based(use_zero_based)
        cram_read_options = CramReadOptions(
            reference_path=reference_path,
            object_storage_options=object_storage_options,
            zero_based=zero_based,
            tag_fields=tag_fields,
        )
        read_options = ReadOptions(cram_read_options=cram_read_options)
        return _read_file(
            path,
            InputFormat.Cram,
            read_options,
            projection_pushdown,
            predicate_pushdown,
            zero_based=zero_based,
        )

    @staticmethod
    def describe_bam(
        path: str,
        sample_size: int = 100,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Get schema information for a BAM file with automatic tag discovery.

        Samples the first N records to discover all available tags and their types.
        Returns detailed schema information including column names, data types,
        nullability, category (core/tag), SAM type, and descriptions.

        Parameters:
            path: The path to the BAM file.
            sample_size: Number of records to sample for tag discovery (default: 100).
                Use higher values for more comprehensive tag discovery.
            chunk_size: The size in MB of a chunk when reading from object storage.
            concurrent_fetches: The number of concurrent fetches when reading from object storage.
            allow_anonymous: Whether to allow anonymous access to object storage.
            enable_request_payer: Whether to enable request payer for object storage.
            max_retries: The maximum number of retries for reading the file.
            timeout: The timeout in seconds for reading the file.
            compression_type: The compression type of the file. If "auto" (default), compression is detected automatically.
            use_zero_based: If True, output 0-based coordinates. If False, 1-based coordinates.

        Returns:
            DataFrame with columns:
            - column_name: Name of the column/field
            - data_type: Arrow data type (e.g., "Utf8", "Int32")
            - nullable: Whether the field can be null
            - category: "core" for fixed columns, "tag" for optional SAM tags
            - sam_type: SAM type code (e.g., "Z", "i") for tags, null for core columns
            - description: Human-readable description of the field

        Example:
            ```python
            import polars_bio as pb

            # Auto-discover all tags present in the file
            schema = pb.describe_bam("file.bam", sample_size=100)
            print(schema)
            # Output:
            # shape: (15, 6)
            # ┌─────────────┬───────────┬──────────┬──────────┬──────────┬──────────────────────┐
            # │ column_name ┆ data_type ┆ nullable ┆ category ┆ sam_type ┆ description          │
            # │ ---         ┆ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---                  │
            # │ str         ┆ str       ┆ bool     ┆ str      ┆ str      ┆ str                  │
            # ╞═════════════╪═══════════╪══════════╪══════════╪══════════╪══════════════════════╡
            # │ name        ┆ Utf8      ┆ true     ┆ core     ┆ null     ┆ Query name           │
            # │ chrom       ┆ Utf8      ┆ true     ┆ core     ┆ null     ┆ Reference name       │
            # │ ...         ┆ ...       ┆ ...      ┆ ...      ┆ ...      ┆ ...                  │
            # │ NM          ┆ Int32     ┆ true     ┆ tag      ┆ i        ┆ Edit distance        │
            # │ AS          ┆ Int32     ┆ true     ┆ tag      ┆ i        ┆ Alignment score      │
            # └─────────────┴───────────┴──────────┴──────────┴──────────┴──────────────────────┘
            ```
        """
        # Build object storage options
        object_storage_options = PyObjectStorageOptions(
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        # Resolve zero_based setting
        zero_based = _resolve_zero_based(use_zero_based)

        # Call Rust function with tag auto-discovery (tag_fields=None)
        df = py_describe_bam(
            ctx,  # PyBioSessionContext
            path,
            object_storage_options,
            zero_based,
            None,  # tag_fields=None enables auto-discovery
            sample_size,
        )

        # Convert DataFusion DataFrame to Polars DataFrame
        return pl.from_arrow(df.to_arrow_table())

    @staticmethod
    def describe_cram(
        path: str,
        reference_path: str = None,
        sample_size: int = 100,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Get schema information for a CRAM file with automatic tag discovery.

        Samples the first N records to discover all available tags and their types.
        Returns detailed schema information including column names, data types,
        nullability, category (core/tag), SAM type, and descriptions.

        Parameters:
            path: The path to the CRAM file.
            reference_path: Optional path to external FASTA reference file.
            sample_size: Number of records to sample for tag discovery (default: 100).
            chunk_size: The size in MB of a chunk when reading from object storage.
            concurrent_fetches: The number of concurrent fetches when reading from object storage.
            allow_anonymous: Whether to allow anonymous access to object storage.
            enable_request_payer: Whether to enable request payer for object storage.
            max_retries: The maximum number of retries for reading the file.
            timeout: The timeout in seconds for reading the file.
            compression_type: The compression type of the file. If "auto" (default), compression is detected automatically.
            use_zero_based: If True, output 0-based coordinates. If False, 1-based coordinates.

        Returns:
            DataFrame with columns:
            - column_name: Name of the column/field
            - data_type: Arrow data type (e.g., "Utf8", "Int32")
            - nullable: Whether the field can be null
            - category: "core" for fixed columns, "tag" for optional SAM tags
            - sam_type: SAM type code (e.g., "Z", "i") for tags, null for core columns
            - description: Human-readable description of the field

        !!! warning "Known Limitation: MD and NM Tags"
            Due to a limitation in the underlying noodles-cram library, **MD (mismatch descriptor) and NM (edit distance) tags are not discoverable** from CRAM files, even when stored. Automatic tag discovery will not include MD/NM tags. Other optional tags (RG, MQ, AM, OQ, etc.) are discovered correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

        Example:
            ```python
            import polars_bio as pb

            # Auto-discover all tags present in the file
            schema = pb.describe_cram("file.cram", sample_size=100)
            print(schema)

            # Filter to see only tag columns
            tags = schema.filter(schema["category"] == "tag")
            print(tags["column_name"])
            ```
        """
        # Build object storage options
        object_storage_options = PyObjectStorageOptions(
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        # Resolve zero_based setting
        zero_based = _resolve_zero_based(use_zero_based)

        # Call Rust function with tag auto-discovery (tag_fields=None)
        df = py_describe_cram(
            ctx,
            path,
            reference_path,
            object_storage_options,
            zero_based,
            None,  # tag_fields=None enables auto-discovery
            sample_size,
        )

        # Convert DataFusion DataFrame to Polars DataFrame
        return pl.from_arrow(df.to_arrow_table())

    @staticmethod
    def read_fastq(
        path: str,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
    ) -> pl.DataFrame:
        """
        Read a FASTQ file into a DataFrame.

        !!! hint "Parallelism & Compression"
            See [File formats support](/polars-bio/features/#file-formats-support),
            [Compression](/polars-bio/features/#compression),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details on parallel reads and supported compression types.

        Parameters:
            path: The path to the FASTQ file.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
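
        !!! Example "Reading a FASTQ file"
            A minimal sketch; the file path is illustrative:
            ```python
            import polars_bio as pb

            # Eagerly read a (optionally gzip/BGZF-compressed) FASTQ file
            df = pb.read_fastq("reads.fastq.gz")
            print(df.head())
            ```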
        """
        return IOOperations.scan_fastq(
            path,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            compression_type,
            projection_pushdown,
        ).collect()

    @staticmethod
    def scan_fastq(
        path: str,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
    ) -> pl.LazyFrame:
        """
        Lazily read a FASTQ file into a LazyFrame.

        !!! hint "Parallelism & Compression"
            See [File formats support](/polars-bio/features/#file-formats-support),
            [Compression](/polars-bio/features/#compression),
            and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details on parallel reads and supported compression types.

        Parameters:
            path: The path to the FASTQ file.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
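
        !!! Example "Lazy scan with projection"
            A minimal sketch; the file path is illustrative, and the selected columns
            follow the FASTQ schema (name, sequence, quality_scores) used elsewhere in this module:
            ```python
            import polars_bio as pb

            lf = pb.scan_fastq("reads.fastq.gz")
            # Only the selected columns are read thanks to projection pushdown
            df = lf.select(["name", "sequence"]).limit(100).collect()
            ```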
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        fastq_read_options = FastqReadOptions(
            object_storage_options=object_storage_options,
        )
        read_options = ReadOptions(fastq_read_options=fastq_read_options)
        return _read_file(path, InputFormat.Fastq, read_options, projection_pushdown)

    @staticmethod
    def read_bed(
        path: str,
        thread_num: int = 1,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Read a BED file into a DataFrame.

        Parameters:
            path: The path to the BED file.
            thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! Note
            Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
            Also, unlike other text formats, **GZIP** compression is not supported.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
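
        !!! Example "Reading a BED4 file"
            A minimal sketch; the file path is illustrative:
            ```python
            import polars_bio as pb

            # Read a BED4 file (chromosome, start, end, name) into a DataFrame
            df = pb.read_bed("regions.bed")
            print(df.head())
            ```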
        """
        lf = IOOperations.scan_bed(
            path,
            thread_num,
            chunk_size,
            concurrent_fetches,
            allow_anonymous,
            enable_request_payer,
            max_retries,
            timeout,
            compression_type,
            projection_pushdown,
            use_zero_based,
        )
        # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
        zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
        df = lf.collect()
        # Set metadata on the collected DataFrame
        if zero_based is not None:
            set_coordinate_system(df, zero_based)
        return df

    @staticmethod
    def scan_bed(
        path: str,
        thread_num: int = 1,
        chunk_size: int = 8,
        concurrent_fetches: int = 1,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        max_retries: int = 5,
        timeout: int = 300,
        compression_type: str = "auto",
        projection_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.LazyFrame:
        """
        Lazily read a BED file into a LazyFrame.

        Parameters:
            path: The path to the BED file.
            thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
            chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries: The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
            use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

        !!! Note
            Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
            Also, unlike other text formats, **GZIP** compression is not supported.

        !!! note
            By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
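
        !!! Example "Lazy scan with filtering"
            A minimal sketch; the file path and chromosome value are illustrative,
            and the column name `chrom` is assumed to match the other readers in this module:
            ```python
            import polars as pl
            import polars_bio as pb

            lf = pb.scan_bed("regions.bed.bgz")
            df = lf.filter(pl.col("chrom") == "chr1").collect()
            ```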
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        zero_based = _resolve_zero_based(use_zero_based)
        bed_read_options = BedReadOptions(
            thread_num=thread_num,
            object_storage_options=object_storage_options,
            zero_based=zero_based,
        )
        read_options = ReadOptions(bed_read_options=bed_read_options)
        return _read_file(
            path,
            InputFormat.Bed,
            read_options,
            projection_pushdown,
            zero_based=zero_based,
        )

    @staticmethod
    def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
        """
        Read a tab-delimited (e.g. BED) file into a Polars DataFrame.
        Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
        but faster. The schema should follow Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

        Parameters:
            path: The path to the file.
            schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
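
        !!! Example "Reading with a Bioframe schema"
            A minimal sketch; the file path is illustrative and the schema name `"bed4"`
            is assumed to be one of the Bioframe-style schemas supported by this reader:
            ```python
            import polars_bio as pb

            # Column names are taken from the Bioframe schema definition
            df = pb.read_table("annotations.tsv", schema="bed4")
            print(df.head())
            ```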
        """
        return IOOperations.scan_table(path, schema, **kwargs).collect()

    @staticmethod
    def scan_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
        """
        Lazily read a tab-delimited (e.g. BED) file into a Polars LazyFrame.
        Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
        but faster and lazy. The schema should follow Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

        Parameters:
            path: The path to the file.
            schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
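
        !!! Example "Lazy scan of a tab-delimited file"
            A minimal sketch; the file path is illustrative. Without a schema, columns
            keep Polars' default names (`column_1`, `column_2`, ...):
            ```python
            import polars_bio as pb

            lf = pb.scan_table("annotations.tsv")
            df = lf.limit(10).collect()
            print(df.columns)
            ```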
        """
        df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
        if schema is not None:
            columns = SCHEMAS[schema]
            if len(columns) != len(df.collect_schema()):
                raise ValueError(
                    f"Schema incompatible with the input. Expected {len(columns)} columns in a schema, got {len(df.collect_schema())} in the input data file. Please provide a valid schema."
                )
            for i, c in enumerate(columns):
                df = df.rename({f"column_{i+1}": c})
        return df

    @staticmethod
    def describe_vcf(
        path: str,
        allow_anonymous: bool = True,
        enable_request_payer: bool = False,
        compression_type: str = "auto",
    ) -> pl.DataFrame:
        """
        Describe VCF INFO schema.

        Parameters:
            path: The path to the VCF file.
            allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
            enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
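
        Example:
            A minimal sketch; the file path is illustrative:
            ```python
            import polars_bio as pb

            # Inspect the INFO field schema of a VCF file
            info_schema = pb.describe_vcf("example.vcf.gz")
            print(info_schema)
            ```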
        """
        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=8,
            concurrent_fetches=1,
            max_retries=1,
            timeout=10,
            compression_type=compression_type,
        )
        return py_describe_vcf(ctx, path, object_storage_options).to_polars()

    @staticmethod
    def from_polars(name: str, df: Union[pl.DataFrame, pl.LazyFrame]) -> None:
        """
        Register a Polars DataFrame as a DataFusion table.

        Parameters:
            name: The name of the table.
            df: The Polars DataFrame.
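
        !!! Example "Registering a DataFrame"
            A minimal sketch with inline data; the table name is arbitrary:
            ```python
            import polars as pl
            import polars_bio as pb

            df = pl.DataFrame({"chrom": ["chr1"], "start": [100], "end": [200]})
            # The DataFrame becomes available as a DataFusion table named "my_intervals"
            pb.from_polars("my_intervals", df)
            ```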
        """
        reader = (
            df.to_arrow()
            if isinstance(df, pl.DataFrame)
            else df.collect().to_arrow().to_reader()
        )
        py_from_polars(ctx, name, reader)

    @staticmethod
    def write_vcf(
        df: Union[pl.DataFrame, pl.LazyFrame],
        path: str,
    ) -> int:
        """
        Write a DataFrame to VCF format.

        Coordinate system is automatically read from DataFrame metadata (set during
        read_vcf). Compression is auto-detected from the file extension.

        Parameters:
            df: The DataFrame or LazyFrame to write.
            path: The output file path. Compression is auto-detected from extension
                  (.vcf.bgz for BGZF, .vcf.gz for GZIP, .vcf for uncompressed).

        Returns:
            The number of rows written.

        !!! Example "Writing VCF files"
            ```python
            import polars_bio as pb

            # Read a VCF file
            df = pb.read_vcf("input.vcf")

            # Write to uncompressed VCF
            pb.write_vcf(df, "output.vcf")

            # Write to BGZF-compressed VCF
            pb.write_vcf(df, "output.vcf.bgz")

            # Write to GZIP-compressed VCF
            pb.write_vcf(df, "output.vcf.gz")
            ```
        """
        return _write_file(df, path, OutputFormat.Vcf)

    @staticmethod
    def sink_vcf(
        lf: pl.LazyFrame,
        path: str,
    ) -> None:
        """
        Streaming write a LazyFrame to VCF format.

        This method executes the LazyFrame immediately and writes the results
        to the specified path. Unlike `write_vcf`, it doesn't return the row count.

        Coordinate system is automatically read from LazyFrame metadata (set during
        scan_vcf). Compression is auto-detected from the file extension.

        Parameters:
            lf: The LazyFrame to write.
            path: The output file path. Compression is auto-detected from extension
                  (.vcf.bgz for BGZF, .vcf.gz for GZIP, .vcf for uncompressed).

        !!! Example "Streaming write VCF"
            ```python
            import polars_bio as pb

            # Lazy read and filter, then sink to VCF
            lf = pb.scan_vcf("large_input.vcf").filter(pl.col("qual") > 30)
            pb.sink_vcf(lf, "filtered_output.vcf.bgz")
            ```
        """
        _write_file(lf, path, OutputFormat.Vcf)

    @staticmethod
    def write_fastq(
        df: Union[pl.DataFrame, pl.LazyFrame],
        path: str,
    ) -> int:
        """
        Write a DataFrame to FASTQ format.

        Compression is auto-detected from the file extension.

        Parameters:
            df: The DataFrame or LazyFrame to write. Must have columns:
                - name: Read name/identifier
                - sequence: DNA sequence
                - quality_scores: Quality scores string
                Optional: description (added after name on header line)
            path: The output file path. Compression is auto-detected from extension
                  (.fastq.bgz for BGZF, .fastq.gz for GZIP, .fastq for uncompressed).

        Returns:
            The number of rows written.

        !!! Example "Writing FASTQ files"
            ```python
            import polars_bio as pb

            # Read a FASTQ file
            df = pb.read_fastq("input.fastq")

            # Write to uncompressed FASTQ
            pb.write_fastq(df, "output.fastq")

            # Write to GZIP-compressed FASTQ
            pb.write_fastq(df, "output.fastq.gz")
            ```
        """
        return _write_file(df, path, OutputFormat.Fastq)

    @staticmethod
    def sink_fastq(
        lf: pl.LazyFrame,
        path: str,
    ) -> None:
        """
        Streaming write a LazyFrame to FASTQ format.

        Compression is auto-detected from the file extension.

        Parameters:
            lf: The LazyFrame to write.
            path: The output file path. Compression is auto-detected from extension
                  (.fastq.bgz for BGZF, .fastq.gz for GZIP, .fastq for uncompressed).

        !!! Example "Streaming write FASTQ"
            ```python
            import polars_bio as pb

            # Lazy read, filter by quality, then sink
            lf = pb.scan_fastq("large_input.fastq.gz")
            pb.sink_fastq(lf.limit(1000), "sample_output.fastq")
            ```
        """
        _write_file(lf, path, OutputFormat.Fastq)

    @staticmethod
    def write_bam(
        df: Union[pl.DataFrame, pl.LazyFrame],
        path: str,
        sort_on_write: bool = False,
    ) -> int:
        """
        Write a DataFrame to BAM/SAM format.

        Compression is auto-detected from file extension:
        - .sam → Uncompressed SAM (plain text)
        - .bam → BGZF-compressed BAM

        For CRAM format, use `write_cram()` instead.

        Parameters:
            df: DataFrame or LazyFrame with 11 core BAM columns + optional tag columns
            path: Output file path (.bam or .sam)
            sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
                If False (default), set header SO:unsorted.

        Returns:
            Number of rows written

        !!! Example "Write BAM files"
            ```python
            import polars_bio as pb
            df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])
            pb.write_bam(df, "output.bam")
            pb.write_bam(df, "output.sam")
            ```
        """
        return _write_bam_file(
            df, path, OutputFormat.Bam, None, sort_on_write=sort_on_write
        )

    @staticmethod
    def sink_bam(
        lf: pl.LazyFrame,
        path: str,
        sort_on_write: bool = False,
    ) -> None:
        """
        Streaming write a LazyFrame to BAM/SAM format.

        For CRAM format, use `sink_cram()` instead.

        Parameters:
            lf: LazyFrame to write
            path: Output file path (.bam or .sam)
            sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
                If False (default), set header SO:unsorted.

        !!! Example "Streaming write BAM"
            ```python
            import polars_bio as pb
            lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)
            pb.sink_bam(lf, "filtered.bam")
            ```
        """
        _write_bam_file(lf, path, OutputFormat.Bam, None, sort_on_write=sort_on_write)

    @staticmethod
    def read_sam(
        path: str,
        tag_fields: Union[list[str], None] = None,
        projection_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Read a SAM file into a DataFrame.

        SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM.
        This function reuses the BAM reader, which auto-detects the format
        from the file extension.

        Parameters:
            path: The path to the SAM file.
            tag_fields: List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]).
                If None, no optional tags are parsed (default).
            projection_pushdown: Enable column projection pushdown to optimize query performance.
            use_zero_based: If True, output 0-based half-open coordinates.
                If False, output 1-based closed coordinates.
                If None (default), uses the global configuration.

        !!! note
            By default, coordinates are output in **1-based closed** format.
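
        !!! Example "Reading a SAM file"
            A minimal sketch; the file path is illustrative and the requested tag must be
            present in the file:
            ```python
            import polars_bio as pb

            df = pb.read_sam("example.sam", tag_fields=["NM"])
            print(df.head())
            ```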
        """
        lf = IOOperations.scan_sam(
            path,
            tag_fields,
            projection_pushdown,
            use_zero_based,
        )
        zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
        df = lf.collect()
        if zero_based is not None:
            set_coordinate_system(df, zero_based)
        return df

    @staticmethod
    def scan_sam(
        path: str,
        tag_fields: Union[list[str], None] = None,
        projection_pushdown: bool = True,
        use_zero_based: Optional[bool] = None,
    ) -> pl.LazyFrame:
        """
        Lazily read a SAM file into a LazyFrame.

        SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM.
        This function reuses the BAM reader, which auto-detects the format
        from the file extension.

        Parameters:
            path: The path to the SAM file.
            tag_fields: List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]).
                If None, no optional tags are parsed (default).
            projection_pushdown: Enable column projection pushdown to optimize query performance.
            use_zero_based: If True, output 0-based half-open coordinates.
                If False, output 1-based closed coordinates.
                If None (default), uses the global configuration.

        !!! note
            By default, coordinates are output in **1-based closed** format.
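
        !!! Example "Lazy scan with filtering"
            A minimal sketch; the file path is illustrative, and the `mapping_quality`
            column is assumed to follow the BAM/CRAM schema shown above:
            ```python
            import polars as pl
            import polars_bio as pb

            lf = pb.scan_sam("example.sam")
            df = lf.filter(pl.col("mapping_quality") > 30).collect()
            ```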
        """
        zero_based = _resolve_zero_based(use_zero_based)
        bam_read_options = BamReadOptions(
            zero_based=zero_based,
            tag_fields=tag_fields,
        )
        read_options = ReadOptions(bam_read_options=bam_read_options)
        return _read_file(
            path,
            InputFormat.Sam,
            read_options,
            projection_pushdown,
            zero_based=zero_based,
        )

    @staticmethod
    def describe_sam(
        path: str,
        sample_size: int = 100,
        use_zero_based: Optional[bool] = None,
    ) -> pl.DataFrame:
        """
        Get schema information for a SAM file with automatic tag discovery.

        Samples the first N records to discover all available tags and their types.
        Reuses the BAM describe logic, which auto-detects SAM from the file extension.

        Parameters:
            path: The path to the SAM file.
            sample_size: Number of records to sample for tag discovery (default: 100).
            use_zero_based: If True, output 0-based coordinates. If False, 1-based coordinates.

        Returns:
            DataFrame with columns: column_name, data_type, nullable, category, sam_type, description
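
        Example:
            A minimal sketch; the file path is illustrative:
            ```python
            import polars_bio as pb

            schema = pb.describe_sam("example.sam", sample_size=200)
            print(schema)
            ```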
        """
        zero_based = _resolve_zero_based(use_zero_based)

        df = py_describe_bam(
            ctx,
            path,
            None,
            zero_based,
            None,
            sample_size,
        )

        return pl.from_arrow(df.to_arrow_table())

    @staticmethod
    def write_sam(
        df: Union[pl.DataFrame, pl.LazyFrame],
        path: str,
        sort_on_write: bool = False,
    ) -> int:
        """
        Write a DataFrame to SAM format (plain text).

        Parameters:
            df: DataFrame or LazyFrame with 11 core BAM/SAM columns + optional tag columns
            path: Output file path (.sam)
            sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
                If False (default), set header SO:unsorted.

        Returns:
            Number of rows written

        !!! Example "Write SAM files"
            ```python
            import polars_bio as pb
            df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])
            pb.write_sam(df, "output.sam")
            ```
        """
        return _write_bam_file(
            df, path, OutputFormat.Sam, None, sort_on_write=sort_on_write
        )

    @staticmethod
    def sink_sam(
        lf: pl.LazyFrame,
        path: str,
        sort_on_write: bool = False,
    ) -> None:
        """
        Streaming write a LazyFrame to SAM format (plain text).

        Parameters:
            lf: LazyFrame to write
            path: Output file path (.sam)
            sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
                If False (default), set header SO:unsorted.

        !!! Example "Streaming write SAM"
            ```python
            import polars_bio as pb
            lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)
            pb.sink_sam(lf, "filtered.sam")
            ```
        """
        _write_bam_file(lf, path, OutputFormat.Sam, None, sort_on_write=sort_on_write)

    @staticmethod
    def write_cram(
        df: Union[pl.DataFrame, pl.LazyFrame],
        path: str,
        reference_path: str,
        sort_on_write: bool = False,
    ) -> int:
        """
        Write a DataFrame to CRAM format.

        CRAM uses reference-based compression, storing only differences from the
        reference sequence. This achieves 30-60% better compression than BAM.

        Parameters:
            df: DataFrame or LazyFrame with 11 core BAM columns + optional tag columns
            path: Output CRAM file path
            reference_path: Path to reference FASTA file (required). The reference must
                contain all sequences referenced by the alignment data.
            sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
                If False (default), set header SO:unsorted.

        Returns:
            Number of rows written

        !!! warning "Known Limitation: MD and NM Tags"
            Due to a limitation in the underlying noodles-cram library, **MD and NM tags cannot be read back from CRAM files** after writing, even though they are written to the file. If you need MD/NM tags for downstream analysis, use BAM format instead. Other optional tags (RG, MQ, AM, OQ, AS, etc.) work correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

        !!! Example "Write CRAM files"
            ```python
            import polars_bio as pb

            df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])

            # Write CRAM with reference (required)
            pb.write_cram(df, "output.cram", reference_path="reference.fasta")

            # For sorted output
            pb.write_cram(df, "output.cram", reference_path="reference.fasta", sort_on_write=True)
            ```
        """
        return _write_bam_file(
            df, path, OutputFormat.Cram, reference_path, sort_on_write=sort_on_write
        )

    @staticmethod
    def sink_cram(
        lf: pl.LazyFrame,
        path: str,
        reference_path: str,
        sort_on_write: bool = False,
    ) -> None:
        """
        Streaming write a LazyFrame to CRAM format.

        CRAM uses reference-based compression, storing only differences from the
        reference sequence. This method streams data without materializing all
        rows in memory.

        Parameters:
            lf: LazyFrame to write
            path: Output CRAM file path
            reference_path: Path to reference FASTA file (required). The reference must
                contain all sequences referenced by the alignment data.
            sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
                If False (default), set header SO:unsorted.

        !!! warning "Known Limitation: MD and NM Tags"
            Due to a limitation in the underlying noodles-cram library, **MD and NM tags cannot be read back from CRAM files** after writing, even though they are written to the file. If you need MD/NM tags for downstream analysis, use BAM format instead. Other optional tags (RG, MQ, AM, OQ, AS, etc.) work correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

        !!! Example "Streaming write CRAM"
            ```python
            import polars_bio as pb
            import polars as pl

            lf = pb.scan_bam("large_input.bam")
            lf = lf.filter(pl.col("mapping_quality") > 30)

            # Write CRAM with reference (required)
            pb.sink_cram(lf, "filtered.cram", reference_path="reference.fasta")

            # For sorted output
            pb.sink_cram(lf, "filtered.cram", reference_path="reference.fasta", sort_on_write=True)
            ```
        """
        _write_bam_file(
            lf, path, OutputFormat.Cram, reference_path, sort_on_write=sort_on_write
        )

describe_bam(path, sample_size=100, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', use_zero_based=None) staticmethod

Get schema information for a BAM file with automatic tag discovery.

Samples the first N records to discover all available tags and their types. Returns detailed schema information including column names, data types, nullability, category (core/tag), SAM type, and descriptions.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `path` | `str` | The path to the BAM file. | *required* |
| `sample_size` | `int` | Number of records to sample for tag discovery (default: 100). Use higher values for more comprehensive tag discovery. | `100` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from object storage. | `8` |
| `concurrent_fetches` | `int` | The number of concurrent fetches when reading from object storage. | `1` |
| `allow_anonymous` | `bool` | Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | Whether to enable request payer for object storage. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file. | `300` |
| `compression_type` | `str` | The compression type of the file. If "auto" (default), compression is detected automatically. | `'auto'` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based coordinates. If False, 1-based coordinates. | `None` |

Returns:

Type: DataFrame

DataFrame with columns:

  • column_name: Name of the column/field
  • data_type: Arrow data type (e.g., "Utf8", "Int32")
  • nullable: Whether the field can be null
  • category: "core" for fixed columns, "tag" for optional SAM tags
  • sam_type: SAM type code (e.g., "Z", "i") for tags, null for core columns
  • description: Human-readable description of the field
Example
import polars_bio as pb

# Auto-discover all tags present in the file
schema = pb.describe_bam("file.bam", sample_size=100)
print(schema)
# Output:
# shape: (15, 6)
# ┌─────────────┬───────────┬──────────┬──────────┬──────────┬──────────────────────┐
# │ column_name ┆ data_type ┆ nullable ┆ category ┆ sam_type ┆ description          │
# │ ---         ┆ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---                  │
# │ str         ┆ str       ┆ bool     ┆ str      ┆ str      ┆ str                  │
# ╞═════════════╪═══════════╪══════════╪══════════╪══════════╪══════════════════════╡
# │ name        ┆ Utf8      ┆ true     ┆ core     ┆ null     ┆ Query name           │
# │ chrom       ┆ Utf8      ┆ true     ┆ core     ┆ null     ┆ Reference name       │
# │ ...         ┆ ...       ┆ ...      ┆ ...      ┆ ...      ┆ ...                  │
# │ NM          ┆ Int32     ┆ true     ┆ tag      ┆ i        ┆ Edit distance        │
# │ AS          ┆ Int32     ┆ true     ┆ tag      ┆ i        ┆ Alignment score      │
# └─────────────┴───────────┴──────────┴──────────┴──────────┴──────────────────────┘
Source code in polars_bio/io.py
@staticmethod
def describe_bam(
    path: str,
    sample_size: int = 100,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Get schema information for a BAM file with automatic tag discovery.

    Samples the first N records to discover all available tags and their types.
    Returns detailed schema information including column names, data types,
    nullability, category (core/tag), SAM type, and descriptions.

    Parameters:
        path: The path to the BAM file.
        sample_size: Number of records to sample for tag discovery (default: 100).
            Use higher values for more comprehensive tag discovery.
        chunk_size: The size in MB of a chunk when reading from object storage.
        concurrent_fetches: The number of concurrent fetches when reading from object storage.
        allow_anonymous: Whether to allow anonymous access to object storage.
        enable_request_payer: Whether to enable request payer for object storage.
        max_retries: The maximum number of retries for reading the file.
        timeout: The timeout in seconds for reading the file.
        compression_type: The compression type of the file. If "auto" (default), compression is detected automatically.
        use_zero_based: If True, output 0-based coordinates. If False, 1-based coordinates.

    Returns:
        DataFrame with columns:
        - column_name: Name of the column/field
        - data_type: Arrow data type (e.g., "Utf8", "Int32")
        - nullable: Whether the field can be null
        - category: "core" for fixed columns, "tag" for optional SAM tags
        - sam_type: SAM type code (e.g., "Z", "i") for tags, null for core columns
        - description: Human-readable description of the field

    Example:
        ```python
        import polars_bio as pb

        # Auto-discover all tags present in the file
        schema = pb.describe_bam("file.bam", sample_size=100)
        print(schema)
        # Output:
        # shape: (15, 6)
        # ┌─────────────┬───────────┬──────────┬──────────┬──────────┬──────────────────────┐
        # │ column_name ┆ data_type ┆ nullable ┆ category ┆ sam_type ┆ description          │
        # │ ---         ┆ ---       ┆ ---      ┆ ---      ┆ ---      ┆ ---                  │
        # │ str         ┆ str       ┆ bool     ┆ str      ┆ str      ┆ str                  │
        # ╞═════════════╪═══════════╪══════════╪══════════╪══════════╪══════════════════════╡
        # │ name        ┆ Utf8      ┆ true     ┆ core     ┆ null     ┆ Query name           │
        # │ chrom       ┆ Utf8      ┆ true     ┆ core     ┆ null     ┆ Reference name       │
        # │ ...         ┆ ...       ┆ ...      ┆ ...      ┆ ...      ┆ ...                  │
        # │ NM          ┆ Int32     ┆ true     ┆ tag      ┆ i        ┆ Edit distance        │
        # │ AS          ┆ Int32     ┆ true     ┆ tag      ┆ i        ┆ Alignment score      │
        # └─────────────┴───────────┴──────────┴──────────┴──────────┴──────────────────────┘
        ```
    """
    # Build object storage options
    object_storage_options = PyObjectStorageOptions(
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    # Resolve zero_based setting
    zero_based = _resolve_zero_based(use_zero_based)

    # Call Rust function with tag auto-discovery (tag_fields=None)
    df = py_describe_bam(
        ctx,  # PyBioSessionContext
        path,
        object_storage_options,
        zero_based,
        None,  # tag_fields=None enables auto-discovery
        sample_size,
    )

    # Convert DataFusion DataFrame to Polars DataFrame
    return pl.from_arrow(df.to_arrow_table())

describe_cram(path, reference_path=None, sample_size=100, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', use_zero_based=None) staticmethod

Get schema information for a CRAM file with automatic tag discovery.

Samples the first N records to discover all available tags and their types. Returns detailed schema information including column names, data types, nullability, category (core/tag), SAM type, and descriptions.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `path` | `str` | The path to the CRAM file. | *required* |
| `reference_path` | `str` | Optional path to external FASTA reference file. | `None` |
| `sample_size` | `int` | Number of records to sample for tag discovery (default: 100). | `100` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from object storage. | `8` |
| `concurrent_fetches` | `int` | The number of concurrent fetches when reading from object storage. | `1` |
| `allow_anonymous` | `bool` | Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | Whether to enable request payer for object storage. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file. | `300` |
| `compression_type` | `str` | The compression type of the file. If "auto" (default), compression is detected automatically. | `'auto'` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based coordinates. If False, 1-based coordinates. | `None` |

Returns:

Type: DataFrame

DataFrame with columns:

  • column_name: Name of the column/field
  • data_type: Arrow data type (e.g., "Utf8", "Int32")
  • nullable: Whether the field can be null
  • category: "core" for fixed columns, "tag" for optional SAM tags
  • sam_type: SAM type code (e.g., "Z", "i") for tags, null for core columns
  • description: Human-readable description of the field

Known Limitation: MD and NM Tags

Due to a limitation in the underlying noodles-cram library, MD (mismatch descriptor) and NM (edit distance) tags are not discoverable from CRAM files, even when stored. Automatic tag discovery will not include MD/NM tags. Other optional tags (RG, MQ, AM, OQ, etc.) are discovered correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

Example
import polars_bio as pb

# Auto-discover all tags present in the file
schema = pb.describe_cram("file.cram", sample_size=100)
print(schema)

# Filter to see only tag columns
tags = schema.filter(schema["category"] == "tag")
print(tags["column_name"])
Source code in polars_bio/io.py
@staticmethod
def describe_cram(
    path: str,
    reference_path: str = None,
    sample_size: int = 100,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Get schema information for a CRAM file with automatic tag discovery.

    Samples the first N records to discover all available tags and their types.
    Returns detailed schema information including column names, data types,
    nullability, category (core/tag), SAM type, and descriptions.

    Parameters:
        path: The path to the CRAM file.
        reference_path: Optional path to external FASTA reference file.
        sample_size: Number of records to sample for tag discovery (default: 100).
        chunk_size: The size in MB of a chunk when reading from object storage.
        concurrent_fetches: The number of concurrent fetches when reading from object storage.
        allow_anonymous: Whether to allow anonymous access to object storage.
        enable_request_payer: Whether to enable request payer for object storage.
        max_retries: The maximum number of retries for reading the file.
        timeout: The timeout in seconds for reading the file.
        compression_type: The compression type of the file. If "auto" (default), compression is detected automatically.
        use_zero_based: If True, output 0-based coordinates. If False, 1-based coordinates.

    Returns:
        DataFrame with columns:
        - column_name: Name of the column/field
        - data_type: Arrow data type (e.g., "Utf8", "Int32")
        - nullable: Whether the field can be null
        - category: "core" for fixed columns, "tag" for optional SAM tags
        - sam_type: SAM type code (e.g., "Z", "i") for tags, null for core columns
        - description: Human-readable description of the field

    !!! warning "Known Limitation: MD and NM Tags"
        Due to a limitation in the underlying noodles-cram library, **MD (mismatch descriptor) and NM (edit distance) tags are not discoverable** from CRAM files, even when stored. Automatic tag discovery will not include MD/NM tags. Other optional tags (RG, MQ, AM, OQ, etc.) are discovered correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

    Example:
        ```python
        import polars_bio as pb

        # Auto-discover all tags present in the file
        schema = pb.describe_cram("file.cram", sample_size=100)
        print(schema)

        # Filter to see only tag columns
        tags = schema.filter(schema["category"] == "tag")
        print(tags["column_name"])
        ```
    """
    # Build object storage options
    object_storage_options = PyObjectStorageOptions(
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    # Resolve zero_based setting
    zero_based = _resolve_zero_based(use_zero_based)

    # Call Rust function with tag auto-discovery (tag_fields=None)
    df = py_describe_cram(
        ctx,
        path,
        reference_path,
        object_storage_options,
        zero_based,
        None,  # tag_fields=None enables auto-discovery
        sample_size,
    )

    # Convert DataFusion DataFrame to Polars DataFrame
    return pl.from_arrow(df.to_arrow_table())

describe_sam(path, sample_size=100, use_zero_based=None) staticmethod

Get schema information for a SAM file with automatic tag discovery.

Samples the first N records to discover all available tags and their types. Reuses the BAM describe logic, which auto-detects SAM from the file extension.

Parameters:

Name Type Description Default
path str

The path to the SAM file.

required
sample_size int

Number of records to sample for tag discovery (default: 100).

100
use_zero_based Optional[bool]

If True, output 0-based coordinates. If False, 1-based coordinates.

None

Returns:

Type Description
DataFrame

DataFrame with columns: column_name, data_type, nullable, category, sam_type, description

Source code in polars_bio/io.py
@staticmethod
def describe_sam(
    path: str,
    sample_size: int = 100,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Get schema information for a SAM file with automatic tag discovery.

    Samples the first N records to discover all available tags and their types.
    Reuses the BAM describe logic, which auto-detects SAM from the file extension.

    Parameters:
        path: The path to the SAM file.
        sample_size: Number of records to sample for tag discovery (default: 100).
        use_zero_based: If True, output 0-based coordinates. If False, 1-based coordinates.

    Returns:
        DataFrame with columns: column_name, data_type, nullable, category, sam_type, description
    """
    zero_based = _resolve_zero_based(use_zero_based)

    df = py_describe_bam(
        ctx,
        path,
        None,
        zero_based,
        None,
        sample_size,
    )

    return pl.from_arrow(df.to_arrow_table())

describe_vcf(path, allow_anonymous=True, enable_request_payer=False, compression_type='auto') staticmethod

Describe VCF INFO schema.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required
allow_anonymous bool

Whether to allow anonymous access to object storage (GCS and S3 supported).

True
enable_request_payer bool

Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
compression_type str

The compression type of the VCF file. If not specified, it will be detected automatically.

'auto'
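
Example

A minimal usage sketch (the VCF path is illustrative):

import polars_bio as pb

# Describe the INFO schema of a bgzipped VCF
info_schema = pb.describe_vcf("variants.vcf.gz")
print(info_schema)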
Source code in polars_bio/io.py
@staticmethod
def describe_vcf(
    path: str,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    compression_type: str = "auto",
) -> pl.DataFrame:
    """
    Describe VCF INFO schema.

    Parameters:
        path: The path to the VCF file.
        allow_anonymous: Whether to allow anonymous access to object storage (GCS and S3 supported).
        enable_request_payer: Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=8,
        concurrent_fetches=1,
        max_retries=1,
        timeout=10,
        compression_type=compression_type,
    )
    return py_describe_vcf(ctx, path, object_storage_options).to_polars()

from_polars(name, df) staticmethod

Register a Polars DataFrame as a DataFusion table.

Parameters:

Name Type Description Default
name str

The name of the table.

required
df Union[DataFrame, LazyFrame]

The Polars DataFrame.

required
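
Example

A minimal registration sketch (the table name and data are illustrative; querying the registered table through the SQL interface, e.g. via pb.sql, is an assumption):

import polars as pl
import polars_bio as pb

intervals = pl.DataFrame({
    "chrom": ["chr1", "chr1"],
    "start": [100, 500],
    "end": [200, 800],
})

# Register the DataFrame as a DataFusion table named "intervals"
pb.from_polars("intervals", intervals)

# The registered table can then be referenced from the SQL interface,
# e.g. pb.sql("SELECT * FROM intervals")  # assumed entry point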
Source code in polars_bio/io.py
@staticmethod
def from_polars(name: str, df: Union[pl.DataFrame, pl.LazyFrame]) -> None:
    """
    Register a Polars DataFrame as a DataFusion table.

    Parameters:
        name: The name of the table.
        df: The Polars DataFrame.
    """
    reader = (
        df.to_arrow()
        if isinstance(df, pl.DataFrame)
        else df.collect().to_arrow().to_reader()
    )
    py_from_polars(ctx, name, reader)

read_bam(path, tag_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Read a BAM file into a DataFrame.

Parallelism & Indexed Reads

Indexed parallel reads and predicate pushdown are automatic when a BAI/CSI index is present. See File formats support, Indexed reads, and Automatic parallel partitioning for details.

Parameters:

Name Type Description Default
path str

The path to the BAM file.

required
tag_fields Union[list[str], None]

List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).

None
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large-scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
projection_pushdown bool

Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

True
predicate_pushdown bool

Enable predicate pushdown using index files (BAI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., file.bam.bai). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like .str.contains() or OR logic are filtered client-side. Correctness is always guaranteed.

True
use_zero_based Optional[bool]

If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration datafusion.bio.coordinate_system_zero_based.

None

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.
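
Example

A minimal usage sketch (the BAM path and tag selection are illustrative):

import polars_bio as pb

# Read alignments and parse the NM and AS optional tags as columns
df = pb.read_bam("sample.bam", tag_fields=["NM", "AS"])
print(df.head())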

Source code in polars_bio/io.py
@staticmethod
def read_bam(
    path: str,
    tag_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Read a BAM file into a DataFrame.

    !!! hint "Parallelism & Indexed Reads"
        Indexed parallel reads and predicate pushdown are automatic when a BAI/CSI index
        is present. See [File formats support](/polars-bio/features/#file-formats-support),
        [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

    Parameters:
        path: The path to the BAM file.
        tag_fields: List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large-scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        predicate_pushdown: Enable predicate pushdown using index files (BAI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.bam.bai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
    """
    lf = IOOperations.scan_bam(
        path,
        tag_fields,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        projection_pushdown,
        predicate_pushdown,
        use_zero_based,
    )
    # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
    zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
    df = lf.collect()
    # Set metadata on the collected DataFrame
    if zero_based is not None:
        set_coordinate_system(df, zero_based)
    return df

read_bed(path, thread_num=1, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True, use_zero_based=None) staticmethod

Read a BED file into a DataFrame.

Parameters:

Name Type Description Default
path str

The path to the BED file.

required
thread_num int

The number of threads to use for reading the BED file. Used only for parallel decompression of BGZF blocks. Works only for local files.

1
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
compression_type str

The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').

'auto'
projection_pushdown bool

Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

True
use_zero_based Optional[bool]

If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration datafusion.bio.coordinate_system_zero_based.

None

Note

Only BED4 format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name. Also, unlike other text formats, GZIP compression is not supported.

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.
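
Example

A minimal usage sketch (the BED4 path is illustrative):

import polars_bio as pb

# Read a BED4 file with 0-based half-open coordinates
df = pb.read_bed("regions.bed", use_zero_based=True)
print(df.head())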

Source code in polars_bio/io.py
@staticmethod
def read_bed(
    path: str,
    thread_num: int = 1,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Read a BED file into a DataFrame.

    Parameters:
        path: The path to the BED file.
        thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! Note
        Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
        Also, unlike other text formats, **GZIP** compression is not supported.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
    """
    lf = IOOperations.scan_bed(
        path,
        thread_num,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        compression_type,
        projection_pushdown,
        use_zero_based,
    )
    # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
    zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
    df = lf.collect()
    # Set metadata on the collected DataFrame
    if zero_based is not None:
        set_coordinate_system(df, zero_based)
    return df

read_cram(path, reference_path=None, tag_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Read a CRAM file into a DataFrame.

Parallelism & Indexed Reads

Indexed parallel reads and predicate pushdown are automatic when a CRAI index is present. See File formats support, Indexed reads, and Automatic parallel partitioning for details.

Parameters:

Name Type Description Default
path str

The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).

required
reference_path str

Optional path to external FASTA reference file (local path only, cloud storage not supported). If not provided, the CRAM file must contain embedded reference sequences. The FASTA file must have an accompanying index file (.fai) in the same directory. Create the index using: samtools faidx reference.fasta

None
tag_fields Union[list[str], None]

List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).

None
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
projection_pushdown bool

Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.

True
predicate_pushdown bool

Enable predicate pushdown using index files (CRAI) for efficient region-based filtering. Index files are auto-discovered (e.g., file.cram.crai). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like .str.contains() or OR logic are filtered client-side. Correctness is always guaranteed.

True
use_zero_based Optional[bool]

If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration datafusion.bio.coordinate_system_zero_based.

None

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.

Known Limitation: MD and NM Tags

Due to a limitation in the underlying noodles-cram library, MD (mismatch descriptor) and NM (edit distance) tags are not accessible from CRAM files, even when stored in the file. These tags can be seen with samtools but are not exposed through the noodles-cram record.data() interface.

Other optional tags (RG, MQ, AM, OQ, etc.) work correctly. This issue is tracked at: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

Workaround: Use BAM format if MD/NM tags are required for your analysis.

Using External Reference

import polars_bio as pb

# Read CRAM with external reference
df = pb.read_cram(
    "/path/to/file.cram",
    reference_path="/path/to/reference.fasta"
)

Public CRAM File Example

Download and read a public CRAM file from 42basepairs:

# Download the CRAM file and reference
wget https://42basepairs.com/download/s3/gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram
wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta

# Create FASTA index (required)
samtools faidx Homo_sapiens_assembly38.fasta

import polars_bio as pb

# Read first 5 reads from the CRAM file
df = pb.scan_cram(
    "NA12878.cram",
    reference_path="Homo_sapiens_assembly38.fasta"
).limit(5).collect()

print(df.select(["name", "chrom", "start", "end", "cigar"]))

Creating CRAM with Embedded Reference

To create a CRAM file with embedded reference using samtools:

samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam

Returns:

Type Description
DataFrame

A Polars DataFrame with the following schema:

  • name: Read name (String)
  • chrom: Chromosome/contig name (String)
  • start: Alignment start position, 1-based (UInt32)
  • end: Alignment end position, 1-based (UInt32)
  • flags: SAM flags (UInt32)
  • cigar: CIGAR string (String)
  • mapping_quality: Mapping quality (UInt32)
  • mate_chrom: Mate chromosome/contig name (String)
  • mate_start: Mate alignment start position, 1-based (UInt32)
  • sequence: Read sequence (String)
  • quality_scores: Base quality scores (String)

Source code in polars_bio/io.py
@staticmethod
def read_cram(
    path: str,
    reference_path: str = None,
    tag_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Read a CRAM file into a DataFrame.

    !!! hint "Parallelism & Indexed Reads"
        Indexed parallel reads and predicate pushdown are automatic when a CRAI index
        is present. See [File formats support](/polars-bio/features/#file-formats-support),
        [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

    Parameters:
        path: The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).
        reference_path: Optional path to external FASTA reference file (**local path only**, cloud storage not supported). If not provided, the CRAM file must contain embedded reference sequences. The FASTA file must have an accompanying index file (.fai) in the same directory. Create the index using: `samtools faidx reference.fasta`
        tag_fields: List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries: The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.
        predicate_pushdown: Enable predicate pushdown using index files (CRAI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.cram.crai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

    !!! warning "Known Limitation: MD and NM Tags"
        Due to a limitation in the underlying noodles-cram library, **MD (mismatch descriptor) and NM (edit distance) tags are not accessible** from CRAM files, even when stored in the file. These tags can be seen with samtools but are not exposed through the noodles-cram record.data() interface.

        Other optional tags (RG, MQ, AM, OQ, etc.) work correctly. This issue is tracked at: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

        **Workaround**: Use BAM format if MD/NM tags are required for your analysis.

    !!! example "Using External Reference"
        ```python
        import polars_bio as pb

        # Read CRAM with external reference
        df = pb.read_cram(
            "/path/to/file.cram",
            reference_path="/path/to/reference.fasta"
        )
        ```

    !!! example "Public CRAM File Example"
        Download and read a public CRAM file from 42basepairs:
        ```bash
        # Download the CRAM file and reference
        wget https://42basepairs.com/download/s3/gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram
        wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta

        # Create FASTA index (required)
        samtools faidx Homo_sapiens_assembly38.fasta
        ```

        ```python
        import polars_bio as pb

        # Read first 5 reads from the CRAM file
        df = pb.scan_cram(
            "NA12878.cram",
            reference_path="Homo_sapiens_assembly38.fasta"
        ).limit(5).collect()

        print(df.select(["name", "chrom", "start", "end", "cigar"]))
        ```

    !!! example "Creating CRAM with Embedded Reference"
        To create a CRAM file with embedded reference using samtools:
        ```bash
        samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam
        ```

    Returns:
        A Polars DataFrame with the following schema:
            - name: Read name (String)
            - chrom: Chromosome/contig name (String)
            - start: Alignment start position, 1-based (UInt32)
            - end: Alignment end position, 1-based (UInt32)
            - flags: SAM flags (UInt32)
            - cigar: CIGAR string (String)
            - mapping_quality: Mapping quality (UInt32)
            - mate_chrom: Mate chromosome/contig name (String)
            - mate_start: Mate alignment start position, 1-based (UInt32)
            - sequence: Read sequence (String)
            - quality_scores: Base quality scores (String)
    """
    lf = IOOperations.scan_cram(
        path,
        reference_path,
        tag_fields,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        projection_pushdown,
        predicate_pushdown,
        use_zero_based,
    )
    # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
    zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
    df = lf.collect()
    # Set metadata on the collected DataFrame
    if zero_based is not None:
        set_coordinate_system(df, zero_based)
    return df

read_fasta(path, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True) staticmethod

Read a FASTA file into a DataFrame.

Parameters:

Name Type Description Default
path str

The path to the FASTA file.

required
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
compression_type str

The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').

'auto'
projection_pushdown bool

Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.

True

Example

wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta

import polars_bio as pb
pb.read_fasta("/tmp/test.fasta").limit(1)
shape: (1, 3)
┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ name                    ┆ description                     ┆ sequence                        │
│ ---                     ┆ ---                             ┆ ---                             │
│ str                     ┆ str                             ┆ str                             │
╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
└─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘

Source code in polars_bio/io.py
@staticmethod
def read_fasta(
    path: str,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
) -> pl.DataFrame:
    """

    Read a FASTA file into a DataFrame.

    Parameters:
        path: The path to the FASTA file.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
        projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.

    !!! Example
        ```shell
        wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
        ```

        ```python
        import polars_bio as pb
        pb.read_fasta("/tmp/test.fasta").limit(1)
        ```
        ```shell
         shape: (1, 3)
        ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
        │ name                    ┆ description                     ┆ sequence                        │
        │ ---                     ┆ ---                             ┆ ---                             │
        │ str                     ┆ str                             ┆ str                             │
        ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
        │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
        └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
        ```
    """
    return IOOperations.scan_fasta(
        path,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        compression_type,
        projection_pushdown,
    ).collect()

read_fastq(path, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True) staticmethod

Read a FASTQ file into a DataFrame.

Parallelism & Compression

See File formats support, Compression, and Automatic parallel partitioning for details on parallel reads and supported compression types.

Parameters:

Name Type Description Default
path str

The path to the FASTQ file.

required
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
compression_type str

The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').

'auto'
projection_pushdown bool

Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

True
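
Example

A minimal usage sketch (the FASTQ path is illustrative):

import polars_bio as pb

# Compression is detected automatically from the .gz extension
df = pb.read_fastq("reads.fastq.gz")
print(df.head())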
Source code in polars_bio/io.py
@staticmethod
def read_fastq(
    path: str,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
) -> pl.DataFrame:
    """
    Read a FASTQ file into a DataFrame.

    !!! hint "Parallelism & Compression"
        See [File formats support](/polars-bio/features/#file-formats-support),
        [Compression](/polars-bio/features/#compression),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details on parallel reads and supported compression types.

    Parameters:
        path: The path to the FASTQ file.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
    """
    return IOOperations.scan_fastq(
        path,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        compression_type,
        projection_pushdown,
    ).collect()

read_gff(path, attr_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Read a GFF file into a DataFrame.

Parameters:

Name Type Description Default
path str

The path to the GFF file.

required
attr_fields Union[list[str], None]

List of attribute field names to extract as separate columns. If None, attributes will be kept as a nested structure. Use this to extract specific attributes like 'ID', 'gene_name', 'gene_type', etc. as direct columns for easier access.

None
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
compression_type str

The compression type of the GFF file. If not specified, it will be detected automatically.

'auto'
projection_pushdown bool

Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

True
predicate_pushdown bool

Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., file.gff.gz.tbi). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like .str.contains() or OR logic are filtered client-side. Correctness is always guaranteed.

True
use_zero_based Optional[bool]

If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration datafusion.bio.coordinate_system_zero_based.

None

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.
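
Example

A minimal usage sketch (the GFF path and attribute names are illustrative):

import polars_bio as pb

# Extract the ID and gene_name attributes as dedicated columns
df = pb.read_gff("annotation.gff3.gz", attr_fields=["ID", "gene_name"])
print(df.head())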

Source code in polars_bio/io.py
@staticmethod
def read_gff(
    path: str,
    attr_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Read a GFF file into a DataFrame.

    Parameters:
        path: The path to the GFF file.
        attr_fields: List of attribute field names to extract as separate columns. If *None*, attributes will be kept as a nested structure. Use this to extract specific attributes like 'ID', 'gene_name', 'gene_type', etc. as direct columns for easier access.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.gff.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
    """
    lf = IOOperations.scan_gff(
        path,
        attr_fields,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        compression_type,
        projection_pushdown,
        predicate_pushdown,
        use_zero_based,
    )
    # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
    zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
    df = lf.collect()
    # Set metadata on the collected DataFrame
    if zero_based is not None:
        set_coordinate_system(df, zero_based)
    return df

read_sam(path, tag_fields=None, projection_pushdown=True, use_zero_based=None) staticmethod

Read a SAM file into a DataFrame.

SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM. This function reuses the BAM reader, which auto-detects the format from the file extension.

Parameters:

Name Type Description Default
path str

The path to the SAM file.

required
tag_fields Union[list[str], None]

List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default).

None
projection_pushdown bool

Enable column projection pushdown to optimize query performance.

True
use_zero_based Optional[bool]

If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration.

None

Note

By default, coordinates are output in 1-based closed format.
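
Example

A minimal usage sketch (the SAM path is illustrative):

import polars_bio as pb

# Read a plain-text SAM file and include the NM tag as a column
df = pb.read_sam("alignments.sam", tag_fields=["NM"])
print(df.head())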

Source code in polars_bio/io.py
@staticmethod
def read_sam(
    path: str,
    tag_fields: Union[list[str], None] = None,
    projection_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Read a SAM file into a DataFrame.

    SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM.
    This function reuses the BAM reader, which auto-detects the format
    from the file extension.

    Parameters:
        path: The path to the SAM file.
        tag_fields: List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]).
            If None, no optional tags are parsed (default).
        projection_pushdown: Enable column projection pushdown to optimize query performance.
        use_zero_based: If True, output 0-based half-open coordinates.
            If False, output 1-based closed coordinates.
            If None (default), uses the global configuration.

    !!! note
        By default, coordinates are output in **1-based closed** format.
    """
    lf = IOOperations.scan_sam(
        path,
        tag_fields,
        projection_pushdown,
        use_zero_based,
    )
    zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
    df = lf.collect()
    if zero_based is not None:
        set_coordinate_system(df, zero_based)
    return df

read_table(path, schema=None, **kwargs) staticmethod

Read a tab-delimited file (e.g. BED) into a Polars DataFrame. Tries to be compatible with Bioframe's read_table but faster. The schema should follow Bioframe's schema format.

Parameters:

Name Type Description Default
path str

The path to the file.

required
schema Dict

The schema should follow Bioframe's schema format.

None
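
Example

A minimal usage sketch (the path is illustrative; passing a Bioframe schema name such as "bed4" is assumed to be accepted):

import polars_bio as pb

# Read a headerless BED file using a Bioframe-style schema name (assumed)
df = pb.read_table("regions.bed", schema="bed4")
print(df.head())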
Source code in polars_bio/io.py
@staticmethod
def read_table(path: str, schema: Dict = None, **kwargs) -> pl.DataFrame:
    """
     Read a tab-delimited file (e.g. BED) into a Polars DataFrame.
     Tries to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html)
     but faster. The schema should follow Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

    Parameters:
        path: The path to the file.
        schema: The schema should follow Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
    """
    return IOOperations.scan_table(path, schema, **kwargs).collect()

read_vcf(path, info_fields=None, format_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Read a VCF file into a DataFrame.

Parallelism & Indexed Reads

Indexed parallel reads and predicate pushdown are automatic when a TBI/CSI index is present. See File formats support, Indexed reads, and Automatic parallel partitioning for details.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required
info_fields Union[list[str], None]

List of INFO field names to include. If None, all INFO fields will be detected automatically from the VCF header. Use this to limit fields for better performance.

None
format_fields Union[list[str], None]

List of FORMAT field names to include (per-sample genotype data). If None, all FORMAT fields will be automatically detected from the VCF header. Column naming depends on the number of samples: for single-sample VCFs, columns are named directly by the FORMAT field (e.g., GT, DP); for multi-sample VCFs, columns are named {sample_name}_{format_field} (e.g., NA12878_GT, NA12879_DP). The GT field is always converted to string with / (unphased) or | (phased) separator.

None
chunk_size int

The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.

8
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.

1
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
compression_type str

The compression type of the VCF file. If not specified, it will be detected automatically.

'auto'
projection_pushdown bool

Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

True
predicate_pushdown bool

Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., file.vcf.gz.tbi). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like .str.contains() or OR logic are filtered client-side. Correctness is always guaranteed.

True
use_zero_based Optional[bool]

If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration datafusion.bio.coordinate_system_zero_based.

None

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.

Reading VCF with INFO and FORMAT fields

import polars_bio as pb

# Read VCF with both INFO and FORMAT fields
df = pb.read_vcf(
    "sample.vcf.gz",
    info_fields=["END"],              # INFO field
    format_fields=["GT", "DP", "GQ"]  # FORMAT fields
)

# Single-sample VCF: FORMAT columns named directly (GT, DP, GQ)
print(df.select(["chrom", "start", "ref", "alt", "END", "GT", "DP", "GQ"]))
# Output:
# shape: (10, 8)
# ┌───────┬───────┬─────┬─────┬──────┬─────┬─────┬─────┐
# │ chrom ┆ start ┆ ref ┆ alt ┆ END  ┆ GT  ┆ DP  ┆ GQ  │
# │ str   ┆ u32   ┆ str ┆ str ┆ i32  ┆ str ┆ i32 ┆ i32 │
# ╞═══════╪═══════╪═════╪═════╪══════╪═════╪═════╪═════╡
# │ 1     ┆ 10009 ┆ A   ┆ .   ┆ null ┆ 0/0 ┆ 10  ┆ 27  │
# │ 1     ┆ 10015 ┆ A   ┆ .   ┆ null ┆ 0/0 ┆ 17  ┆ 35  │
# └───────┴───────┴─────┴─────┴──────┴─────┴─────┴─────┘

# Multi-sample VCF: FORMAT columns named {sample}_{field}
df = pb.read_vcf("multisample.vcf", format_fields=["GT", "DP"])
print(df.select(["chrom", "start", "NA12878_GT", "NA12878_DP", "NA12879_GT"]))
Source code in polars_bio/io.py
@staticmethod
def read_vcf(
    path: str,
    info_fields: Union[list[str], None] = None,
    format_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.DataFrame:
    """
    Read a VCF file into a DataFrame.

    !!! hint "Parallelism & Indexed Reads"
        Indexed parallel reads and predicate pushdown are automatic when a TBI/CSI index
        is present. See [File formats support](/polars-bio/features/#file-formats-support),
        [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

    Parameters:
        path: The path to the VCF file.
        info_fields: List of INFO field names to include. If *None*, all INFO fields will be detected automatically from the VCF header. Use this to limit fields for better performance.
        format_fields: List of FORMAT field names to include (per-sample genotype data). If *None*, all FORMAT fields will be automatically detected from the VCF header. Column naming depends on the number of samples: for **single-sample** VCFs, columns are named directly by the FORMAT field (e.g., `GT`, `DP`); for **multi-sample** VCFs, columns are named `{sample_name}_{format_field}` (e.g., `NA12878_GT`, `NA12879_DP`). The GT field is always converted to string with `/` (unphased) or `|` (phased) separator.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.vcf.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

    !!! Example "Reading VCF with INFO and FORMAT fields"
        ```python
        import polars_bio as pb

        # Read VCF with both INFO and FORMAT fields
        df = pb.read_vcf(
            "sample.vcf.gz",
            info_fields=["END"],              # INFO field
            format_fields=["GT", "DP", "GQ"]  # FORMAT fields
        )

        # Single-sample VCF: FORMAT columns named directly (GT, DP, GQ)
        print(df.select(["chrom", "start", "ref", "alt", "END", "GT", "DP", "GQ"]))
        # Output:
        # shape: (10, 8)
        # ┌───────┬───────┬─────┬─────┬──────┬─────┬─────┬─────┐
        # │ chrom ┆ start ┆ ref ┆ alt ┆ END  ┆ GT  ┆ DP  ┆ GQ  │
        # │ str   ┆ u32   ┆ str ┆ str ┆ i32  ┆ str ┆ i32 ┆ i32 │
        # ╞═══════╪═══════╪═════╪═════╪══════╪═════╪═════╪═════╡
        # │ 1     ┆ 10009 ┆ A   ┆ .   ┆ null ┆ 0/0 ┆ 10  ┆ 27  │
        # │ 1     ┆ 10015 ┆ A   ┆ .   ┆ null ┆ 0/0 ┆ 17  ┆ 35  │
        # └───────┴───────┴─────┴─────┴──────┴─────┴─────┴─────┘

        # Multi-sample VCF: FORMAT columns named {sample}_{field}
        df = pb.read_vcf("multisample.vcf", format_fields=["GT", "DP"])
        print(df.select(["chrom", "start", "NA12878_GT", "NA12878_DP", "NA12879_GT"]))
        ```
    """
    lf = IOOperations.scan_vcf(
        path,
        info_fields,
        format_fields,
        chunk_size,
        concurrent_fetches,
        allow_anonymous,
        enable_request_payer,
        max_retries,
        timeout,
        compression_type,
        projection_pushdown,
        predicate_pushdown,
        use_zero_based,
    )
    # Get metadata before collecting (polars-config-meta doesn't preserve through collect)
    zero_based = lf.config_meta.get_metadata().get("coordinate_system_zero_based")
    df = lf.collect()
    # Set metadata on the collected DataFrame
    if zero_based is not None:
        set_coordinate_system(df, zero_based)
    return df

scan_bam(path, tag_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Lazily read a BAM file into a LazyFrame.

Parallelism & Indexed Reads

Indexed parallel reads and predicate pushdown are automatic when a BAI/CSI index is present. See File formats support, Indexed reads, and Automatic parallel partitioning for details.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the BAM file. | required |
| `tag_fields` | `Union[list[str], None]` | List of BAM tag names to include as columns (e.g., `["NM", "MD", "AS"]`). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode). | `None` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level. | `True` |
| `predicate_pushdown` | `bool` | Enable predicate pushdown using index files (BAI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.bam.bai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed. | `True` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`. | `None` |

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.
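
A minimal sketch of a lazy BAM scan with optional tags and an indexed region filter. The file name `example.bam` is illustrative, and the `chrom`/`start`/`mapping_quality` column names are assumed to match the alignment schema described for `scan_cram` below.

```python
import polars as pl
import polars_bio as pb

# Hypothetical local BAM; a sibling example.bam.bai enables indexed predicate pushdown.
lf = pb.scan_bam("example.bam", tag_fields=["NM", "AS"])

# Simple equality/comparison predicates can be pushed down to the index;
# everything else is still applied client-side, so results stay correct.
df = (
    lf.filter((pl.col("chrom") == "chr1") & (pl.col("start") < 1_000_000))
    .select(["name", "chrom", "start", "end", "mapping_quality", "NM", "AS"])
    .collect()
)
print(df)
```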

Source code in polars_bio/io.py
@staticmethod
def scan_bam(
    path: str,
    tag_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.LazyFrame:
    """
    Lazily read a BAM file into a LazyFrame.

    !!! hint "Parallelism & Indexed Reads"
        Indexed parallel reads and predicate pushdown are automatic when a BAI/CSI index
        is present. See [File formats support](/polars-bio/features/#file-formats-support),
        [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

    Parameters:
        path: The path to the BAM file.
        tag_fields: List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        predicate_pushdown: Enable predicate pushdown using index files (BAI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.bam.bai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type="auto",
    )

    zero_based = _resolve_zero_based(use_zero_based)
    bam_read_options = BamReadOptions(
        object_storage_options=object_storage_options,
        zero_based=zero_based,
        tag_fields=tag_fields,
    )
    read_options = ReadOptions(bam_read_options=bam_read_options)
    return _read_file(
        path,
        InputFormat.Bam,
        read_options,
        projection_pushdown,
        predicate_pushdown,
        zero_based=zero_based,
    )

scan_bed(path, thread_num=1, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True, use_zero_based=None) staticmethod

Lazily read a BED file into a LazyFrame.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the BED file. | required |
| `thread_num` | `int` | The number of threads to use for reading the BED file. Used only for parallel decompression of BGZF blocks. Works only for local files. | `1` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `compression_type` | `str` | The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz'). | `'auto'` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level. | `True` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`. | `None` |

Note

Only BED4 format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name. Also unlike other text formats, GZIP compression is not supported.

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.
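
A minimal sketch of reading a local BED4 file; `regions.bed` is an illustrative path, and the exact output column names are assumed to follow the BED4 fields listed above (chromosome, start, end, name).

```python
import polars_bio as pb

# Hypothetical local BED4 file; BGZF-compressed input ('bgz') is also accepted.
lf = pb.scan_bed("regions.bed")

# Request 0-based half-open coordinates explicitly instead of the 1-based default.
lf_zero = pb.scan_bed("regions.bed", use_zero_based=True)

print(lf.limit(5).collect())
```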

Source code in polars_bio/io.py
@staticmethod
def scan_bed(
    path: str,
    thread_num: int = 1,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.LazyFrame:
    """
    Lazily read a BED file into a LazyFrame.

    Parameters:
        path: The path to the BED file.
        thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the BED file. If not specified, it will be detected automatically based on the file extension. BGZF compression is supported ('bgz').
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! Note
        Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
        Also unlike other text formats, **GZIP** compression is not supported.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    zero_based = _resolve_zero_based(use_zero_based)
    bed_read_options = BedReadOptions(
        thread_num=thread_num,
        object_storage_options=object_storage_options,
        zero_based=zero_based,
    )
    read_options = ReadOptions(bed_read_options=bed_read_options)
    return _read_file(
        path,
        InputFormat.Bed,
        read_options,
        projection_pushdown,
        zero_based=zero_based,
    )

scan_cram(path, reference_path=None, tag_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Lazily read a CRAM file into a LazyFrame.

Parallelism & Indexed Reads

Indexed parallel reads and predicate pushdown are automatic when a CRAI index is present. See File formats support, Indexed reads, and Automatic parallel partitioning for details.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob). | required |
| `reference_path` | `str` | Optional path to external FASTA reference file (local path only, cloud storage not supported). If not provided, the CRAM file must contain embedded reference sequences. The FASTA file must have an accompanying index file (.fai) in the same directory. Create the index using: `samtools faidx reference.fasta` | `None` |
| `tag_fields` | `Union[list[str], None]` | List of CRAM tag names to include as columns (e.g., `["NM", "MD", "AS"]`). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode). | `None` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `projection_pushdown` | `bool` | Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage. | `True` |
| `predicate_pushdown` | `bool` | Enable predicate pushdown using index files (CRAI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.cram.crai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed. | `True` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`. | `None` |

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.

Known Limitation: MD and NM Tags

Due to a limitation in the underlying noodles-cram library, MD (mismatch descriptor) and NM (edit distance) tags are not accessible from CRAM files, even when stored in the file. These tags can be seen with samtools but are not exposed through the noodles-cram record.data() interface.

Other optional tags (RG, MQ, AM, OQ, etc.) work correctly. This issue is tracked at: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

Workaround: Use BAM format if MD/NM tags are required for your analysis.

Using External Reference

import polars_bio as pb

# Lazy scan CRAM with external reference
lf = pb.scan_cram(
    "/path/to/file.cram",
    reference_path="/path/to/reference.fasta"
)

# Apply transformations and collect
df = lf.filter(pl.col("chrom") == "chr1").collect()

Public CRAM File Example

Download and read a public CRAM file from 42basepairs:

# Download the CRAM file and reference
wget https://42basepairs.com/download/s3/gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram
wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta

# Create FASTA index (required)
samtools faidx Homo_sapiens_assembly38.fasta

import polars_bio as pb
import polars as pl

# Lazy scan and filter for chromosome 20 reads
df = pb.scan_cram(
    "NA12878.cram",
    reference_path="Homo_sapiens_assembly38.fasta"
).filter(
    pl.col("chrom") == "chr20"
).select(
    ["name", "chrom", "start", "end", "mapping_quality"]
).limit(10).collect()

print(df)

Creating CRAM with Embedded Reference

To create a CRAM file with embedded reference using samtools:

samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam

Returns:

`LazyFrame` — A Polars LazyFrame with the following schema:

- name: Read name (String)
- chrom: Chromosome/contig name (String)
- start: Alignment start position, 1-based (UInt32)
- end: Alignment end position, 1-based (UInt32)
- flags: SAM flags (UInt32)
- cigar: CIGAR string (String)
- mapping_quality: Mapping quality (UInt32)
- mate_chrom: Mate chromosome/contig name (String)
- mate_start: Mate alignment start position, 1-based (UInt32)
- sequence: Read sequence (String)
- quality_scores: Base quality scores (String)

Source code in polars_bio/io.py
@staticmethod
def scan_cram(
    path: str,
    reference_path: str = None,
    tag_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.LazyFrame:
    """
    Lazily read a CRAM file into a LazyFrame.

    !!! hint "Parallelism & Indexed Reads"
        Indexed parallel reads and predicate pushdown are automatic when a CRAI index
        is present. See [File formats support](/polars-bio/features/#file-formats-support),
        [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

    Parameters:
        path: The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).
        reference_path: Optional path to external FASTA reference file (**local path only**, cloud storage not supported). If not provided, the CRAM file must contain embedded reference sequences. The FASTA file must have an accompanying index file (.fai) in the same directory. Create the index using: `samtools faidx reference.fasta`
        tag_fields: List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries: The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        projection_pushdown: Enable column projection pushdown optimization. When True, only requested columns are processed at the DataFusion execution level, improving performance and reducing memory usage.
        predicate_pushdown: Enable predicate pushdown using index files (CRAI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.cram.crai`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

    !!! warning "Known Limitation: MD and NM Tags"
        Due to a limitation in the underlying noodles-cram library, **MD (mismatch descriptor) and NM (edit distance) tags are not accessible** from CRAM files, even when stored in the file. These tags can be seen with samtools but are not exposed through the noodles-cram record.data() interface.

        Other optional tags (RG, MQ, AM, OQ, etc.) work correctly. This issue is tracked at: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

        **Workaround**: Use BAM format if MD/NM tags are required for your analysis.

    !!! example "Using External Reference"
        ```python
        import polars_bio as pb

        # Lazy scan CRAM with external reference
        lf = pb.scan_cram(
            "/path/to/file.cram",
            reference_path="/path/to/reference.fasta"
        )

        # Apply transformations and collect
        df = lf.filter(pl.col("chrom") == "chr1").collect()
        ```

    !!! example "Public CRAM File Example"
        Download and read a public CRAM file from 42basepairs:
        ```bash
        # Download the CRAM file and reference
        wget https://42basepairs.com/download/s3/gatk-test-data/wgs_cram/NA12878_20k_hg38/NA12878.cram
        wget https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta

        # Create FASTA index (required)
        samtools faidx Homo_sapiens_assembly38.fasta
        ```

        ```python
        import polars_bio as pb
        import polars as pl

        # Lazy scan and filter for chromosome 20 reads
        df = pb.scan_cram(
            "NA12878.cram",
            reference_path="Homo_sapiens_assembly38.fasta"
        ).filter(
            pl.col("chrom") == "chr20"
        ).select(
            ["name", "chrom", "start", "end", "mapping_quality"]
        ).limit(10).collect()

        print(df)
        ```

    !!! example "Creating CRAM with Embedded Reference"
        To create a CRAM file with embedded reference using samtools:
        ```bash
        samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam
        ```

    Returns:
        A Polars LazyFrame with the following schema:
            - name: Read name (String)
            - chrom: Chromosome/contig name (String)
            - start: Alignment start position, 1-based (UInt32)
            - end: Alignment end position, 1-based (UInt32)
            - flags: SAM flags (UInt32)
            - cigar: CIGAR string (String)
            - mapping_quality: Mapping quality (UInt32)
            - mate_chrom: Mate chromosome/contig name (String)
            - mate_start: Mate alignment start position, 1-based (UInt32)
            - sequence: Read sequence (String)
            - quality_scores: Base quality scores (String)
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type="auto",
    )

    zero_based = _resolve_zero_based(use_zero_based)
    cram_read_options = CramReadOptions(
        reference_path=reference_path,
        object_storage_options=object_storage_options,
        zero_based=zero_based,
        tag_fields=tag_fields,
    )
    read_options = ReadOptions(cram_read_options=cram_read_options)
    return _read_file(
        path,
        InputFormat.Cram,
        read_options,
        projection_pushdown,
        predicate_pushdown,
        zero_based=zero_based,
    )

scan_fasta(path, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True) staticmethod

Lazily read a FASTA file into a LazyFrame.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the FASTA file. | required |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `compression_type` | `str` | The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz'). | `'auto'` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level. | `True` |

Example

wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta

import polars_bio as pb
pb.scan_fasta("/tmp/test.fasta").limit(1).collect()
 shape: (1, 3)
┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
│ name                    ┆ description                     ┆ sequence                        │
│ ---                     ┆ ---                             ┆ ---                             │
│ str                     ┆ str                             ┆ str                             │
╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
│ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
└─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘

Source code in polars_bio/io.py
@staticmethod
def scan_fasta(
    path: str,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
) -> pl.LazyFrame:
    """

    Lazily read a FASTA file into a LazyFrame.

    Parameters:
        path: The path to the FASTA file.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the FASTA file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

    !!! Example
        ```shell
        wget https://www.ebi.ac.uk/ena/browser/api/fasta/BK006935.2?download=true -O /tmp/test.fasta
        ```

        ```python
        import polars_bio as pb
        pb.scan_fasta("/tmp/test.fasta").limit(1).collect()
        ```
        ```shell
         shape: (1, 3)
        ┌─────────────────────────┬─────────────────────────────────┬─────────────────────────────────┐
        │ name                    ┆ description                     ┆ sequence                        │
        │ ---                     ┆ ---                             ┆ ---                             │
        │ str                     ┆ str                             ┆ str                             │
        ╞═════════════════════════╪═════════════════════════════════╪═════════════════════════════════╡
        │ ENA|BK006935|BK006935.2 ┆ TPA_inf: Saccharomyces cerevis… ┆ CCACACCACACCCACACACCCACACACCAC… │
        └─────────────────────────┴─────────────────────────────────┴─────────────────────────────────┘
        ```
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )
    fasta_read_options = FastaReadOptions(
        object_storage_options=object_storage_options
    )
    read_options = ReadOptions(fasta_read_options=fasta_read_options)
    return _read_file(path, InputFormat.Fasta, read_options, projection_pushdown)

scan_fastq(path, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True) staticmethod

Lazily read a FASTQ file into a LazyFrame.

Parallelism & Compression

See File formats support, Compression, and Automatic parallel partitioning for details on parallel reads and supported compression types.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the FASTQ file. | required |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `compression_type` | `str` | The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz'). | `'auto'` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level. | `True` |
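
A minimal sketch of lazy FASTQ scanning; the file path and bucket name are illustrative, and the object-store settings follow the recommendations above for large-scale reads.

```python
import polars_bio as pb

# Hypothetical local file; compression ('gz'/'bgz') is detected from the extension.
lf = pb.scan_fastq("reads.fastq.gz")
print(lf.limit(2).collect())

# For object stores, larger chunks and more concurrent fetches usually help.
lf_remote = pb.scan_fastq(
    "gs://my-bucket/reads.fastq.bgz",  # illustrative bucket/path
    chunk_size=64,
    concurrent_fetches=8,
)
```
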
Source code in polars_bio/io.py
@staticmethod
def scan_fastq(
    path: str,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
) -> pl.LazyFrame:
    """
    Lazily read a FASTQ file into a LazyFrame.

    !!! hint "Parallelism & Compression"
        See [File formats support](/polars-bio/features/#file-formats-support),
        [Compression](/polars-bio/features/#compression),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details on parallel reads and supported compression types.

    Parameters:
        path: The path to the FASTQ file.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compressions are supported ('bgz', 'gz').
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    fastq_read_options = FastqReadOptions(
        object_storage_options=object_storage_options,
    )
    read_options = ReadOptions(fastq_read_options=fastq_read_options)
    return _read_file(path, InputFormat.Fastq, read_options, projection_pushdown)

scan_gff(path, attr_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Lazily read a GFF file into a LazyFrame.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the GFF file. | required |
| `attr_fields` | `Union[list[str], None]` | List of attribute field names to extract as separate columns. If None, attributes will be kept as a nested structure. Use this to extract specific attributes like 'ID', 'gene_name', 'gene_type', etc. as direct columns for easier access. | `None` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `compression_type` | `str` | The compression type of the GFF file. If not specified, it will be detected automatically. | `'auto'` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level. | `True` |
| `predicate_pushdown` | `bool` | Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.gff.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed. | `True` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`. | `None` |

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.
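
A minimal sketch of a lazy GFF scan with selected attributes and an indexed region filter; the file name is illustrative, and the `chrom` column name is assumed.

```python
import polars as pl
import polars_bio as pb

# Hypothetical bgzip-compressed GFF with a sibling gencode.gff3.gz.tbi index.
lf = pb.scan_gff(
    "gencode.gff3.gz",
    attr_fields=["ID", "gene_name", "gene_type"],  # extracted as direct columns
)

# A simple equality predicate can be pushed down to the TBI/CSI index.
df = lf.filter(pl.col("chrom") == "chr21").limit(5).collect()
print(df)
```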

Source code in polars_bio/io.py
@staticmethod
def scan_gff(
    path: str,
    attr_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.LazyFrame:
    """
    Lazily read a GFF file into a LazyFrame.

    Parameters:
        path: The path to the GFF file.
        attr_fields: List of attribute field names to extract as separate columns. If *None*, attributes will be kept as a nested structure. Use this to extract specific attributes like 'ID', 'gene_name', 'gene_type', etc. as direct columns for easier access.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large-scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the GFF file. If not specified, it will be detected automatically.
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.gff.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    zero_based = _resolve_zero_based(use_zero_based)
    gff_read_options = GffReadOptions(
        attr_fields=attr_fields,
        object_storage_options=object_storage_options,
        zero_based=zero_based,
    )
    read_options = ReadOptions(gff_read_options=gff_read_options)
    return _read_file(
        path,
        InputFormat.Gff,
        read_options,
        projection_pushdown,
        predicate_pushdown,
        zero_based=zero_based,
    )

scan_sam(path, tag_fields=None, projection_pushdown=True, use_zero_based=None) staticmethod

Lazily read a SAM file into a LazyFrame.

SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM. This function reuses the BAM reader, which auto-detects the format from the file extension.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the SAM file. | required |
| `tag_fields` | `Union[list[str], None]` | List of SAM tag names to include as columns (e.g., `["NM", "MD", "AS"]`). If None, no optional tags are parsed (default). | `None` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance. | `True` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration. | `None` |

Note

By default, coordinates are output in 1-based closed format.
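
A minimal sketch of a lazy SAM scan; the file name is illustrative, and the `mapping_quality` column name is assumed to match the alignment schema used by the BAM reader.

```python
import polars as pl
import polars_bio as pb

# Hypothetical local SAM file; optional tags are parsed only when requested.
lf = pb.scan_sam("example.sam", tag_fields=["NM"])
df = lf.filter(pl.col("mapping_quality") >= 30).limit(5).collect()
print(df)
```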

Source code in polars_bio/io.py
@staticmethod
def scan_sam(
    path: str,
    tag_fields: Union[list[str], None] = None,
    projection_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.LazyFrame:
    """
    Lazily read a SAM file into a LazyFrame.

    SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM.
    This function reuses the BAM reader, which auto-detects the format
    from the file extension.

    Parameters:
        path: The path to the SAM file.
        tag_fields: List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]).
            If None, no optional tags are parsed (default).
        projection_pushdown: Enable column projection pushdown to optimize query performance.
        use_zero_based: If True, output 0-based half-open coordinates.
            If False, output 1-based closed coordinates.
            If None (default), uses the global configuration.

    !!! note
        By default, coordinates are output in **1-based closed** format.
    """
    zero_based = _resolve_zero_based(use_zero_based)
    bam_read_options = BamReadOptions(
        zero_based=zero_based,
        tag_fields=tag_fields,
    )
    read_options = ReadOptions(bam_read_options=bam_read_options)
    return _read_file(
        path,
        InputFormat.Sam,
        read_options,
        projection_pushdown,
        zero_based=zero_based,
    )

scan_table(path, schema=None, **kwargs) staticmethod

Lazily read a tab-delimited file (e.g. BED) into a Polars LazyFrame. Aims to be compatible with Bioframe's read_table, but faster and lazy. The schema should follow Bioframe's schema format.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the file. | required |
| `schema` | `Dict` | Schema should follow the Bioframe's schema format. | `None` |
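
A minimal sketch of scanning a tab-delimited interval file; the path is illustrative, and `"bed3"` is assumed to be one of the Bioframe-style schema names accepted by `schema`.

```python
import polars_bio as pb

# With a schema, columns are renamed to the Bioframe names (assumed: chrom, start, end).
lf = pb.scan_table("intervals.tsv", schema="bed3")
print(lf.limit(5).collect())

# Without a schema, Polars' default headerless names are kept (column_1, column_2, ...).
lf_raw = pb.scan_table("intervals.tsv")
```
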
Source code in polars_bio/io.py
@staticmethod
def scan_table(path: str, schema: Dict = None, **kwargs) -> pl.LazyFrame:
    """
     Lazily read a tab-delimited file (e.g. BED) into a Polars LazyFrame.
     Aims to be compatible with Bioframe's [read_table](https://bioframe.readthedocs.io/en/latest/guide-io.html),
     but faster and lazy. The schema should follow Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).

    Parameters:
        path: The path to the file.
        schema: Schema should follow the Bioframe's schema [format](https://github.com/open2c/bioframe/blob/2b685eebef393c2c9e6220dcf550b3630d87518e/bioframe/io/schemas.py#L174).
    """
    df = pl.scan_csv(path, separator="\t", has_header=False, **kwargs)
    if schema is not None:
        columns = SCHEMAS[schema]
        if len(columns) != len(df.collect_schema()):
            raise ValueError(
                f"Schema incompatible with the input. Expected {len(columns)} columns in a schema, got {len(df.collect_schema())} in the input data file. Please provide a valid schema."
            )
        for i, c in enumerate(columns):
            df = df.rename({f"column_{i+1}": c})
    return df

scan_vcf(path, info_fields=None, format_fields=None, chunk_size=8, concurrent_fetches=1, allow_anonymous=True, enable_request_payer=False, max_retries=5, timeout=300, compression_type='auto', projection_pushdown=True, predicate_pushdown=True, use_zero_based=None) staticmethod

Lazily read a VCF file into a LazyFrame.

Parallelism & Indexed Reads

Indexed parallel reads and predicate pushdown are automatic when a TBI/CSI index is present. See File formats support, Indexed reads, and Automatic parallel partitioning for details.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `path` | `str` | The path to the VCF file. | required |
| `info_fields` | `Union[list[str], None]` | List of INFO field names to include. If None, all INFO fields will be detected automatically from the VCF header. Use this to limit fields for better performance. | `None` |
| `format_fields` | `Union[list[str], None]` | List of FORMAT field names to include (per-sample genotype data). If None, all FORMAT fields will be automatically detected from the VCF header. Column naming depends on the number of samples: for single-sample VCFs, columns are named directly by the FORMAT field (e.g., `GT`, `DP`); for multi-sample VCFs, columns are named `{sample_name}_{format_field}` (e.g., `NA12878_GT`, `NA12879_DP`). The GT field is always converted to string with `/` (unphased) or `|` (phased) separator. | `None` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64. | `8` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more. | `1` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |
| `compression_type` | `str` | The compression type of the VCF file. If not specified, it will be detected automatically. | `'auto'` |
| `projection_pushdown` | `bool` | Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level. | `True` |
| `predicate_pushdown` | `bool` | Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.vcf.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed. | `True` |
| `use_zero_based` | `Optional[bool]` | If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`. | `None` |

Note

By default, coordinates are output in 1-based closed format. Use use_zero_based=True or set pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True) for 0-based half-open coordinates.

Lazy scanning VCF with INFO and FORMAT fields

import polars_bio as pb

# Lazily scan VCF with both INFO and FORMAT fields
lf = pb.scan_vcf(
    "sample.vcf.gz",
    info_fields=["END"],              # INFO field
    format_fields=["GT", "DP", "GQ"]  # FORMAT fields
)

# Apply filters and collect only what's needed
df = lf.filter(pl.col("DP") > 20).select(
    ["chrom", "start", "ref", "alt", "GT", "DP", "GQ"]
).collect()

# Single-sample VCF: FORMAT columns named directly (GT, DP, GQ)
# Multi-sample VCF: FORMAT columns named {sample}_{field}
Source code in polars_bio/io.py
@staticmethod
def scan_vcf(
    path: str,
    info_fields: Union[list[str], None] = None,
    format_fields: Union[list[str], None] = None,
    chunk_size: int = 8,
    concurrent_fetches: int = 1,
    allow_anonymous: bool = True,
    enable_request_payer: bool = False,
    max_retries: int = 5,
    timeout: int = 300,
    compression_type: str = "auto",
    projection_pushdown: bool = True,
    predicate_pushdown: bool = True,
    use_zero_based: Optional[bool] = None,
) -> pl.LazyFrame:
    """
    Lazily read a VCF file into a LazyFrame.

    !!! hint "Parallelism & Indexed Reads"
        Indexed parallel reads and predicate pushdown are automatic when a TBI/CSI index
        is present. See [File formats support](/polars-bio/features/#file-formats-support),
        [Indexed reads](/polars-bio/features/#indexed-reads-predicate-pushdown),
        and [Automatic parallel partitioning](/polars-bio/features/#automatic-parallel-partitioning) for details.

    Parameters:
        path: The path to the VCF file.
        info_fields: List of INFO field names to include. If *None*, all INFO fields will be detected automatically from the VCF header. Use this to limit fields for better performance.
        format_fields: List of FORMAT field names to include (per-sample genotype data). If *None*, all FORMAT fields will be automatically detected from the VCF header. Column naming depends on the number of samples: for **single-sample** VCFs, columns are named directly by the FORMAT field (e.g., `GT`, `DP`); for **multi-sample** VCFs, columns are named `{sample_name}_{format_field}` (e.g., `NA12878_GT`, `NA12879_DP`). The GT field is always converted to string with `/` (unphased) or `|` (phased) separator.
        chunk_size: The size in MB of a chunk when reading from an object store. The default is 8 MB. For large scale operations, it is recommended to increase this value to 64.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. The default is 1. For large scale operations, it is recommended to increase this value to 8 or even more.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
        projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.
        predicate_pushdown: Enable predicate pushdown using index files (TBI/CSI) for efficient region-based filtering. Index files are auto-discovered (e.g., `file.vcf.gz.tbi`). Only simple predicates are pushed down (equality, comparisons, IN); complex predicates like `.str.contains()` or OR logic are filtered client-side. Correctness is always guaranteed.
        use_zero_based: If True, output 0-based half-open coordinates. If False, output 1-based closed coordinates. If None (default), uses the global configuration `datafusion.bio.coordinate_system_zero_based`.

    !!! note
        By default, coordinates are output in **1-based closed** format. Use `use_zero_based=True` or set `pb.set_option(pb.POLARS_BIO_COORDINATE_SYSTEM_ZERO_BASED, True)` for 0-based half-open coordinates.

    !!! Example "Lazy scanning VCF with INFO and FORMAT fields"
        ```python
        import polars_bio as pb

        # Lazily scan VCF with both INFO and FORMAT fields
        lf = pb.scan_vcf(
            "sample.vcf.gz",
            info_fields=["END"],              # INFO field
            format_fields=["GT", "DP", "GQ"]  # FORMAT fields
        )

        # Apply filters and collect only what's needed
        df = lf.filter(pl.col("DP") > 20).select(
            ["chrom", "start", "ref", "alt", "GT", "DP", "GQ"]
        ).collect()

        # Single-sample VCF: FORMAT columns named directly (GT, DP, GQ)
        # Multi-sample VCF: FORMAT columns named {sample}_{field}
        ```
    """
    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    # Use provided info_fields or autodetect from VCF header
    if info_fields is not None:
        initial_info_fields = info_fields
    else:
        # Get all info fields from VCF header for proper projection pushdown
        all_info_fields = None
        try:
            vcf_schema_df = IOOperations.describe_vcf(
                path,
                allow_anonymous=allow_anonymous,
                enable_request_payer=enable_request_payer,
                compression_type=compression_type,
            )
            # Use column name 'name' not 'id' based on the schema output
            all_info_fields = vcf_schema_df.select("name").to_series().to_list()
        except Exception:
            # Fallback to None if unable to get info fields
            all_info_fields = None

        # Always start with all info fields to establish full schema
        # The callback will re-register with only requested info fields for optimization
        initial_info_fields = all_info_fields

    zero_based = _resolve_zero_based(use_zero_based)
    vcf_read_options = VcfReadOptions(
        info_fields=initial_info_fields,
        format_fields=format_fields,
        object_storage_options=object_storage_options,
        zero_based=zero_based,
    )
    read_options = ReadOptions(vcf_read_options=vcf_read_options)
    return _read_file(
        path,
        InputFormat.Vcf,
        read_options,
        projection_pushdown,
        predicate_pushdown,
        zero_based=zero_based,
    )

sink_bam(lf, path, sort_on_write=False) staticmethod

Streaming write a LazyFrame to BAM/SAM format.

For CRAM format, use sink_cram() instead.

Parameters:

| Name | Type | Description | Default |
|---|---|---|---|
| `lf` | `LazyFrame` | LazyFrame to write | required |
| `path` | `str` | Output file path (.bam or .sam) | required |
| `sort_on_write` | `bool` | If True, sort records by (chrom, start) and set header SO:coordinate. If False (default), set header SO:unsorted. | `False` |

Streaming write BAM

import polars_bio as pb
lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)
pb.sink_bam(lf, "filtered.bam")
Source code in polars_bio/io.py
@staticmethod
def sink_bam(
    lf: pl.LazyFrame,
    path: str,
    sort_on_write: bool = False,
) -> None:
    """
    Streaming write a LazyFrame to BAM/SAM format.

    For CRAM format, use `sink_cram()` instead.

    Parameters:
        lf: LazyFrame to write
        path: Output file path (.bam or .sam)
        sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
            If False (default), set header SO:unsorted.

    !!! Example "Streaming write BAM"
        ```python
        import polars_bio as pb
        lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)
        pb.sink_bam(lf, "filtered.bam")
        ```
    """
    _write_bam_file(lf, path, OutputFormat.Bam, None, sort_on_write=sort_on_write)
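
A short sketch of `sort_on_write` with `sink_bam`, assuming a hypothetical unsorted input BAM; with `sort_on_write=True` the records are sorted by (chrom, start) and the header is marked SO:coordinate:

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)

# Default: records are written in input order, header SO:unsorted
pb.sink_bam(lf, "filtered.unsorted.bam")

# Sorted output: records sorted by (chrom, start), header SO:coordinate
pb.sink_bam(lf, "filtered.sorted.bam", sort_on_write=True)
```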

sink_cram(lf, path, reference_path, sort_on_write=False) staticmethod

Streaming write a LazyFrame to CRAM format.

CRAM uses reference-based compression, storing only differences from the reference sequence. This method streams data without materializing all rows in memory.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `lf` | `LazyFrame` | LazyFrame to write | required |
| `path` | `str` | Output CRAM file path | required |
| `reference_path` | `str` | Path to reference FASTA file (required). The reference must contain all sequences referenced by the alignment data. | required |
| `sort_on_write` | `bool` | If True, sort records by (chrom, start) and set header SO:coordinate. If False (default), set header SO:unsorted. | `False` |

Known Limitation: MD and NM Tags

Due to a limitation in the underlying noodles-cram library, MD and NM tags cannot be read back from CRAM files after writing, even though they are written to the file. If you need MD/NM tags for downstream analysis, use BAM format instead. Other optional tags (RG, MQ, AM, OQ, AS, etc.) work correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

Streaming write CRAM

```python
import polars_bio as pb
import polars as pl

lf = pb.scan_bam("large_input.bam")
lf = lf.filter(pl.col("mapping_quality") > 30)

# Write CRAM with reference (required)
pb.sink_cram(lf, "filtered.cram", reference_path="reference.fasta")

# For sorted output
pb.sink_cram(lf, "filtered.cram", reference_path="reference.fasta", sort_on_write=True)
```
Source code in polars_bio/io.py
@staticmethod
def sink_cram(
    lf: pl.LazyFrame,
    path: str,
    reference_path: str,
    sort_on_write: bool = False,
) -> None:
    """
    Streaming write a LazyFrame to CRAM format.

    CRAM uses reference-based compression, storing only differences from the
    reference sequence. This method streams data without materializing all
    rows in memory.

    Parameters:
        lf: LazyFrame to write
        path: Output CRAM file path
        reference_path: Path to reference FASTA file (required). The reference must
            contain all sequences referenced by the alignment data.
        sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
            If False (default), set header SO:unsorted.

    !!! warning "Known Limitation: MD and NM Tags"
        Due to a limitation in the underlying noodles-cram library, **MD and NM tags cannot be read back from CRAM files** after writing, even though they are written to the file. If you need MD/NM tags for downstream analysis, use BAM format instead. Other optional tags (RG, MQ, AM, OQ, AS, etc.) work correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

    !!! Example "Streaming write CRAM"
        ```python
        import polars_bio as pb
        import polars as pl

        lf = pb.scan_bam("large_input.bam")
        lf = lf.filter(pl.col("mapping_quality") > 30)

        # Write CRAM with reference (required)
        pb.sink_cram(lf, "filtered.cram", reference_path="reference.fasta")

        # For sorted output
        pb.sink_cram(lf, "filtered.cram", reference_path="reference.fasta", sort_on_write=True)
        ```
    """
    _write_bam_file(
        lf, path, OutputFormat.Cram, reference_path, sort_on_write=sort_on_write
    )

sink_fastq(lf, path) staticmethod

Streaming write a LazyFrame to FASTQ format.

Compression is auto-detected from the file extension.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `lf` | `LazyFrame` | The LazyFrame to write. | required |
| `path` | `str` | The output file path. Compression is auto-detected from extension (.fastq.bgz for BGZF, .fastq.gz for GZIP, .fastq for uncompressed). | required |

Streaming write FASTQ

```python
import polars_bio as pb

# Lazy read, then sink a sample of the records
lf = pb.scan_fastq("large_input.fastq.gz")
pb.sink_fastq(lf.limit(1000), "sample_output.fastq")
```
Source code in polars_bio/io.py
@staticmethod
def sink_fastq(
    lf: pl.LazyFrame,
    path: str,
) -> None:
    """
    Streaming write a LazyFrame to FASTQ format.

    Compression is auto-detected from the file extension.

    Parameters:
        lf: The LazyFrame to write.
        path: The output file path. Compression is auto-detected from extension
              (.fastq.bgz for BGZF, .fastq.gz for GZIP, .fastq for uncompressed).

    !!! Example "Streaming write FASTQ"
        ```python
        import polars_bio as pb

        # Lazy read, filter by quality, then sink
        lf = pb.scan_fastq("large_input.fastq.gz")
        pb.sink_fastq(lf.limit(1000), "sample_output.fastq")
        ```
    """
    _write_file(lf, path, OutputFormat.Fastq)
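
Since the codec is chosen purely from the output extension, the same LazyFrame can be sunk to different compression formats; a sketch with hypothetical file names:

```python
import polars_bio as pb

lf = pb.scan_fastq("large_input.fastq.gz")

pb.sink_fastq(lf, "subset.fastq")      # uncompressed
pb.sink_fastq(lf, "subset.fastq.gz")   # GZIP
pb.sink_fastq(lf, "subset.fastq.bgz")  # BGZF
```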

sink_sam(lf, path, sort_on_write=False) staticmethod

Streaming write a LazyFrame to SAM format (plain text).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `lf` | `LazyFrame` | LazyFrame to write | required |
| `path` | `str` | Output file path (.sam) | required |
| `sort_on_write` | `bool` | If True, sort records by (chrom, start) and set header SO:coordinate. If False (default), set header SO:unsorted. | `False` |

Streaming write SAM

```python
import polars as pl
import polars_bio as pb

lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)
pb.sink_sam(lf, "filtered.sam")
```
Source code in polars_bio/io.py
@staticmethod
def sink_sam(
    lf: pl.LazyFrame,
    path: str,
    sort_on_write: bool = False,
) -> None:
    """
    Streaming write a LazyFrame to SAM format (plain text).

    Parameters:
        lf: LazyFrame to write
        path: Output file path (.sam)
        sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
            If False (default), set header SO:unsorted.

    !!! Example "Streaming write SAM"
        ```python
        import polars_bio as pb
        lf = pb.scan_bam("input.bam").filter(pl.col("mapping_quality") > 20)
        pb.sink_sam(lf, "filtered.sam")
        ```
    """
    _write_bam_file(lf, path, OutputFormat.Sam, None, sort_on_write=sort_on_write)

sink_vcf(lf, path) staticmethod

Streaming write a LazyFrame to VCF format.

This method executes the LazyFrame immediately and writes the results to the specified path. Unlike write_vcf, it doesn't return the row count.

Coordinate system is automatically read from LazyFrame metadata (set during scan_vcf). Compression is auto-detected from the file extension.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `lf` | `LazyFrame` | The LazyFrame to write. | required |
| `path` | `str` | The output file path. Compression is auto-detected from extension (.vcf.bgz for BGZF, .vcf.gz for GZIP, .vcf for uncompressed). | required |

Streaming write VCF

```python
import polars as pl
import polars_bio as pb

# Lazy read and filter, then sink to VCF
lf = pb.scan_vcf("large_input.vcf").filter(pl.col("qual") > 30)
pb.sink_vcf(lf, "filtered_output.vcf.bgz")
```
Source code in polars_bio/io.py
@staticmethod
def sink_vcf(
    lf: pl.LazyFrame,
    path: str,
) -> None:
    """
    Streaming write a LazyFrame to VCF format.

    This method executes the LazyFrame immediately and writes the results
    to the specified path. Unlike `write_vcf`, it doesn't return the row count.

    Coordinate system is automatically read from LazyFrame metadata (set during
    scan_vcf). Compression is auto-detected from the file extension.

    Parameters:
        lf: The LazyFrame to write.
        path: The output file path. Compression is auto-detected from extension
              (.vcf.bgz for BGZF, .vcf.gz for GZIP, .vcf for uncompressed).

    !!! Example "Streaming write VCF"
        ```python
        import polars_bio as pb

        # Lazy read and filter, then sink to VCF
        lf = pb.scan_vcf("large_input.vcf").filter(pl.col("qual") > 30)
        pb.sink_vcf(lf, "filtered_output.vcf.bgz")
        ```
    """
    _write_file(lf, path, OutputFormat.Vcf)
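
A sketch contrasting `sink_vcf` (returns nothing) with `write_vcf` (returns the row count), assuming a hypothetical local input.vcf:

```python
import polars_bio as pb

lf = pb.scan_vcf("input.vcf")

# Streaming sink: no row count is returned
pb.sink_vcf(lf, "out_stream.vcf.bgz")

# Eager write: the number of written rows is returned
n_rows = pb.write_vcf(lf, "out_eager.vcf.bgz")
print(n_rows)
```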

write_bam(df, path, sort_on_write=False) staticmethod

Write a DataFrame to BAM/SAM format.

Compression is auto-detected from file extension:

- .sam → Uncompressed SAM (plain text)
- .bam → BGZF-compressed BAM

For CRAM format, use write_cram() instead.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `df` | `Union[DataFrame, LazyFrame]` | DataFrame or LazyFrame with 11 core BAM columns + optional tag columns | required |
| `path` | `str` | Output file path (.bam or .sam) | required |
| `sort_on_write` | `bool` | If True, sort records by (chrom, start) and set header SO:coordinate. If False (default), set header SO:unsorted. | `False` |

Returns:

| Type | Description |
|------|-------------|
| `int` | Number of rows written |

Write BAM files

```python
import polars_bio as pb
df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])
pb.write_bam(df, "output.bam")
pb.write_bam(df, "output.sam")
```
Source code in polars_bio/io.py
@staticmethod
def write_bam(
    df: Union[pl.DataFrame, pl.LazyFrame],
    path: str,
    sort_on_write: bool = False,
) -> int:
    """
    Write a DataFrame to BAM/SAM format.

    Compression is auto-detected from file extension:
    - .sam → Uncompressed SAM (plain text)
    - .bam → BGZF-compressed BAM

    For CRAM format, use `write_cram()` instead.

    Parameters:
        df: DataFrame or LazyFrame with 11 core BAM columns + optional tag columns
        path: Output file path (.bam or .sam)
        sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
            If False (default), set header SO:unsorted.

    Returns:
        Number of rows written

    !!! Example "Write BAM files"
        ```python
        import polars_bio as pb
        df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])
        pb.write_bam(df, "output.bam")
        pb.write_bam(df, "output.sam")
        ```
    """
    return _write_bam_file(
        df, path, OutputFormat.Bam, None, sort_on_write=sort_on_write
    )

write_cram(df, path, reference_path, sort_on_write=False) staticmethod

Write a DataFrame to CRAM format.

CRAM uses reference-based compression, storing only differences from the reference sequence. This achieves 30-60% better compression than BAM.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `df` | `Union[DataFrame, LazyFrame]` | DataFrame or LazyFrame with 11 core BAM columns + optional tag columns | required |
| `path` | `str` | Output CRAM file path | required |
| `reference_path` | `str` | Path to reference FASTA file (required). The reference must contain all sequences referenced by the alignment data. | required |
| `sort_on_write` | `bool` | If True, sort records by (chrom, start) and set header SO:coordinate. If False (default), set header SO:unsorted. | `False` |

Returns:

| Type | Description |
|------|-------------|
| `int` | Number of rows written |

Known Limitation: MD and NM Tags

Due to a limitation in the underlying noodles-cram library, MD and NM tags cannot be read back from CRAM files after writing, even though they are written to the file. If you need MD/NM tags for downstream analysis, use BAM format instead. Other optional tags (RG, MQ, AM, OQ, AS, etc.) work correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

Write CRAM files

```python
import polars_bio as pb

df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])

# Write CRAM with reference (required)
pb.write_cram(df, "output.cram", reference_path="reference.fasta")

# For sorted output
pb.write_cram(df, "output.cram", reference_path="reference.fasta", sort_on_write=True)
```
Source code in polars_bio/io.py
@staticmethod
def write_cram(
    df: Union[pl.DataFrame, pl.LazyFrame],
    path: str,
    reference_path: str,
    sort_on_write: bool = False,
) -> int:
    """
    Write a DataFrame to CRAM format.

    CRAM uses reference-based compression, storing only differences from the
    reference sequence. This achieves 30-60% better compression than BAM.

    Parameters:
        df: DataFrame or LazyFrame with 11 core BAM columns + optional tag columns
        path: Output CRAM file path
        reference_path: Path to reference FASTA file (required). The reference must
            contain all sequences referenced by the alignment data.
        sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
            If False (default), set header SO:unsorted.

    Returns:
        Number of rows written

    !!! warning "Known Limitation: MD and NM Tags"
        Due to a limitation in the underlying noodles-cram library, **MD and NM tags cannot be read back from CRAM files** after writing, even though they are written to the file. If you need MD/NM tags for downstream analysis, use BAM format instead. Other optional tags (RG, MQ, AM, OQ, AS, etc.) work correctly. See: https://github.com/biodatageeks/datafusion-bio-formats/issues/54

    !!! Example "Write CRAM files"
        ```python
        import polars_bio as pb

        df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])

        # Write CRAM with reference (required)
        pb.write_cram(df, "output.cram", reference_path="reference.fasta")

        # For sorted output
        pb.write_cram(df, "output.cram", reference_path="reference.fasta", sort_on_write=True)
        ```
    """
    return _write_bam_file(
        df, path, OutputFormat.Cram, reference_path, sort_on_write=sort_on_write
    )

write_fastq(df, path) staticmethod

Write a DataFrame to FASTQ format.

Compression is auto-detected from the file extension.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `df` | `Union[DataFrame, LazyFrame]` | The DataFrame or LazyFrame to write. Must have columns: `name` (read name/identifier), `sequence` (DNA sequence), `quality_scores` (quality scores string). Optional: `description` (added after name on header line). | required |
| `path` | `str` | The output file path. Compression is auto-detected from extension (.fastq.bgz for BGZF, .fastq.gz for GZIP, .fastq for uncompressed). | required |

Returns:

| Type | Description |
|------|-------------|
| `int` | The number of rows written. |

Writing FASTQ files

```python
import polars_bio as pb

# Read a FASTQ file
df = pb.read_fastq("input.fastq")

# Write to uncompressed FASTQ
pb.write_fastq(df, "output.fastq")

# Write to GZIP-compressed FASTQ
pb.write_fastq(df, "output.fastq.gz")
```
Source code in polars_bio/io.py
@staticmethod
def write_fastq(
    df: Union[pl.DataFrame, pl.LazyFrame],
    path: str,
) -> int:
    """
    Write a DataFrame to FASTQ format.

    Compression is auto-detected from the file extension.

    Parameters:
        df: The DataFrame or LazyFrame to write. Must have columns:
            - name: Read name/identifier
            - sequence: DNA sequence
            - quality_scores: Quality scores string
            Optional: description (added after name on header line)
        path: The output file path. Compression is auto-detected from extension
              (.fastq.bgz for BGZF, .fastq.gz for GZIP, .fastq for uncompressed).

    Returns:
        The number of rows written.

    !!! Example "Writing FASTQ files"
        ```python
        import polars_bio as pb

        # Read a FASTQ file
        df = pb.read_fastq("input.fastq")

        # Write to uncompressed FASTQ
        pb.write_fastq(df, "output.fastq")

        # Write to GZIP-compressed FASTQ
        pb.write_fastq(df, "output.fastq.gz")
        ```
    """
    return _write_file(df, path, OutputFormat.Fastq)
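
A sketch of the required column layout for `write_fastq`, built from an in-memory DataFrame with hypothetical reads; `description` is optional:

```python
import polars as pl
import polars_bio as pb

df = pl.DataFrame(
    {
        "name": ["read1", "read2"],
        "description": ["sample=A", "sample=B"],
        "sequence": ["ACGTACGT", "TTGACCAA"],
        "quality_scores": ["IIIIIIII", "IIIIIIII"],
    }
)

# Writes two FASTQ records; returns the number of rows written (2)
pb.write_fastq(df, "toy.fastq")
```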

write_sam(df, path, sort_on_write=False) staticmethod

Write a DataFrame to SAM format (plain text).

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `df` | `Union[DataFrame, LazyFrame]` | DataFrame or LazyFrame with 11 core BAM/SAM columns + optional tag columns | required |
| `path` | `str` | Output file path (.sam) | required |
| `sort_on_write` | `bool` | If True, sort records by (chrom, start) and set header SO:coordinate. If False (default), set header SO:unsorted. | `False` |

Returns:

| Type | Description |
|------|-------------|
| `int` | Number of rows written |

Write SAM files

```python
import polars_bio as pb
df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])
pb.write_sam(df, "output.sam")
```
Source code in polars_bio/io.py
@staticmethod
def write_sam(
    df: Union[pl.DataFrame, pl.LazyFrame],
    path: str,
    sort_on_write: bool = False,
) -> int:
    """
    Write a DataFrame to SAM format (plain text).

    Parameters:
        df: DataFrame or LazyFrame with 11 core BAM/SAM columns + optional tag columns
        path: Output file path (.sam)
        sort_on_write: If True, sort records by (chrom, start) and set header SO:coordinate.
            If False (default), set header SO:unsorted.

    Returns:
        Number of rows written

    !!! Example "Write SAM files"
        ```python
        import polars_bio as pb
        df = pb.read_bam("input.bam", tag_fields=["NM", "AS"])
        pb.write_sam(df, "output.sam")
        ```
    """
    return _write_bam_file(
        df, path, OutputFormat.Sam, None, sort_on_write=sort_on_write
    )

write_vcf(df, path) staticmethod

Write a DataFrame to VCF format.

Coordinate system is automatically read from DataFrame metadata (set during read_vcf). Compression is auto-detected from the file extension.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `df` | `Union[DataFrame, LazyFrame]` | The DataFrame or LazyFrame to write. | required |
| `path` | `str` | The output file path. Compression is auto-detected from extension (.vcf.bgz for BGZF, .vcf.gz for GZIP, .vcf for uncompressed). | required |

Returns:

| Type | Description |
|------|-------------|
| `int` | The number of rows written. |

Writing VCF files

```python
import polars_bio as pb

# Read a VCF file
df = pb.read_vcf("input.vcf")

# Write to uncompressed VCF
pb.write_vcf(df, "output.vcf")

# Write to BGZF-compressed VCF
pb.write_vcf(df, "output.vcf.bgz")

# Write to GZIP-compressed VCF
pb.write_vcf(df, "output.vcf.gz")
```
Source code in polars_bio/io.py
@staticmethod
def write_vcf(
    df: Union[pl.DataFrame, pl.LazyFrame],
    path: str,
) -> int:
    """
    Write a DataFrame to VCF format.

    Coordinate system is automatically read from DataFrame metadata (set during
    read_vcf). Compression is auto-detected from the file extension.

    Parameters:
        df: The DataFrame or LazyFrame to write.
        path: The output file path. Compression is auto-detected from extension
              (.vcf.bgz for BGZF, .vcf.gz for GZIP, .vcf for uncompressed).

    Returns:
        The number of rows written.

    !!! Example "Writing VCF files"
        ```python
        import polars_bio as pb

        # Read a VCF file
        df = pb.read_vcf("input.vcf")

        # Write to uncompressed VCF
        pb.write_vcf(df, "output.vcf")

        # Write to BGZF-compressed VCF
        pb.write_vcf(df, "output.vcf.bgz")

        # Write to GZIP-compressed VCF
        pb.write_vcf(df, "output.vcf.gz")
        ```
    """
    return _write_file(df, path, OutputFormat.Vcf)

data_processing

Source code in polars_bio/sql.py
class SQL:
    @staticmethod
    def register_vcf(
        path: str,
        name: Union[str, None] = None,
        info_fields: Union[list[str], None] = None,
        chunk_size: int = 64,
        concurrent_fetches: int = 8,
        allow_anonymous: bool = True,
        max_retries: int = 5,
        timeout: int = 300,
        enable_request_payer: bool = False,
        compression_type: str = "auto",
    ) -> None:
        """
        Register a VCF file as a Datafusion table.

        Parameters:
            path: The path to the VCF file.
            name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
            info_fields: List of INFO field names to register. If *None*, all INFO fields will be detected automatically from the VCF header. Use this to limit registration to specific fields for better performance.
            chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
        !!! note
            VCF reader uses **1-based** coordinate system for the `start` and `end` columns.

        !!! Example
              ```python
              import polars_bio as pb
              pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz")
              ```
             ```shell
             INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz
             ```
        !!! tip
            `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the VCF file. As a rule of thumb for large scale operations (reading a whole VCF), it is recommended to keep the default values.
        """

        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        # Use provided info_fields or autodetect from VCF header
        if info_fields is not None:
            all_info_fields = info_fields
        else:
            # Get all info fields from VCF header for automatic field detection
            all_info_fields = None
            try:
                from .io import IOOperations

                vcf_schema_df = IOOperations.describe_vcf(
                    path,
                    allow_anonymous=allow_anonymous,
                    enable_request_payer=enable_request_payer,
                    compression_type=compression_type,
                )
                all_info_fields = vcf_schema_df.select("name").to_series().to_list()
            except Exception:
                # Fallback to empty list if unable to get info fields
                all_info_fields = []

        vcf_read_options = VcfReadOptions(
            info_fields=all_info_fields,
            object_storage_options=object_storage_options,
        )
        read_options = ReadOptions(vcf_read_options=vcf_read_options)
        py_register_table(ctx, path, name, InputFormat.Vcf, read_options)

    @staticmethod
    def register_gff(
        path: str,
        name: Union[str, None] = None,
        chunk_size: int = 64,
        concurrent_fetches: int = 8,
        allow_anonymous: bool = True,
        max_retries: int = 5,
        timeout: int = 300,
        enable_request_payer: bool = False,
        compression_type: str = "auto",
    ) -> None:
        """
        Register a GFF file as a Datafusion table.

        Parameters:
            path: The path to the GFF file.
            name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
            chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
        !!! note
            GFF reader uses **1-based** coordinate system for the `start` and `end` columns.

        !!! Example
            ```shell
            wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz -O /tmp/gencode.v38.annotation.gff3.gz
            ```
            ```python
            import polars_bio as pb
            pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz")
            pb.sql("SELECT attributes, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY attributes").limit(5).collect()
            ```
            ```shell

            shape: (5, 2)
            ┌───────────────────┬───────┐
            │ Parent            ┆ cnt   │
            │ ---               ┆ ---   │
            │ str               ┆ i64   │
            ╞═══════════════════╪═══════╡
            │ null              ┆ 60649 │
            │ ENSG00000223972.5 ┆ 2     │
            │ ENST00000456328.2 ┆ 3     │
            │ ENST00000450305.2 ┆ 6     │
            │ ENSG00000227232.5 ┆ 1     │
            └───────────────────┴───────┘

            ```
        !!! tip
            `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the GFF file. As a rule of thumb for large scale operations (reading a whole GFF), it is recommended to keep the default values.
        """

        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        gff_read_options = GffReadOptions(
            attr_fields=None,
            object_storage_options=object_storage_options,
        )
        read_options = ReadOptions(gff_read_options=gff_read_options)
        py_register_table(ctx, path, name, InputFormat.Gff, read_options)

    @staticmethod
    def register_fastq(
        path: str,
        name: Union[str, None] = None,
        chunk_size: int = 64,
        concurrent_fetches: int = 8,
        allow_anonymous: bool = True,
        max_retries: int = 5,
        timeout: int = 300,
        enable_request_payer: bool = False,
        compression_type: str = "auto",
        parallel: bool = False,
    ) -> None:
        """
        Register a FASTQ file as a Datafusion table.

        Parameters:
            path: The path to the FASTQ file.
            name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
            chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
            parallel: Whether to use the parallel reader for BGZF compressed files. Default is False. If a file ends with ".gz" but is actually BGZF, it will attempt the parallel path and fall back to standard if not BGZF.

        !!! Example
            ```python
              import polars_bio as pb
              pb.register_fastq("gs://genomics-public-data/platinum-genomes/fastq/ERR194146.fastq.gz", "test_fastq")
              pb.sql("SELECT name, description FROM test_fastq WHERE name LIKE 'ERR194146%'").limit(5).collect()
            ```

            ```shell

              shape: (5, 2)
            ┌─────────────────────┬─────────────────────────────────┐
            │ name                ┆ description                     │
            │ ---                 ┆ ---                             │
            │ str                 ┆ str                             │
            ╞═════════════════════╪═════════════════════════════════╡
            │ ERR194146.812444541 ┆ HSQ1008:141:D0CC8ACXX:2:1204:1… │
            │ ERR194146.812444542 ┆ HSQ1008:141:D0CC8ACXX:4:1206:1… │
            │ ERR194146.812444543 ┆ HSQ1008:141:D0CC8ACXX:3:2104:5… │
            │ ERR194146.812444544 ┆ HSQ1008:141:D0CC8ACXX:3:2204:1… │
            │ ERR194146.812444545 ┆ HSQ1008:141:D0CC8ACXX:3:1304:3… │
            └─────────────────────┴─────────────────────────────────┘

            ```


        !!! tip
            `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the FASTQ file. As a rule of thumb for large scale operations (reading a whole FASTQ), it is recommended to keep the default values.
        """

        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        fastq_read_options = FastqReadOptions(
            object_storage_options=object_storage_options, parallel=parallel
        )
        read_options = ReadOptions(fastq_read_options=fastq_read_options)
        py_register_table(ctx, path, name, InputFormat.Fastq, read_options)

    @staticmethod
    def register_bed(
        path: str,
        name: Union[str, None] = None,
        thread_num: int = 1,
        chunk_size: int = 64,
        concurrent_fetches: int = 8,
        allow_anonymous: bool = True,
        max_retries: int = 5,
        timeout: int = 300,
        enable_request_payer: bool = False,
        compression_type: str = "auto",
    ) -> None:
        """
        Register a BED file as a Datafusion table.

        Parameters:
            path: The path to the BED file.
            name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
            thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
            chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            compression_type: The compression type of the BED file. If not specified, it will be detected automatically.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.

        !!! Note
            Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
            Also unlike other text formats, **GZIP** compression is not supported.

        !!! Example
            ```shell

             cd /tmp
             wget https://webs.iiitd.edu.in/raghava/humcfs/fragile_site_bed.zip -O fragile_site_bed.zip
             unzip fragile_site_bed.zip -x "__MACOSX/*" "*/.DS_Store"
            ```

            ```python
            import polars_bio as pb
            pb.register_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed", "test_bed")
            b.sql("select * FROM test_bed WHERE name LIKE 'FRA5%'").collect()
            ```

            ```shell

                shape: (8, 4)
                ┌───────┬───────────┬───────────┬───────┐
                │ chrom ┆ start     ┆ end       ┆ name  │
                │ ---   ┆ ---       ┆ ---       ┆ ---   │
                │ str   ┆ u32       ┆ u32       ┆ str   │
                ╞═══════╪═══════════╪═══════════╪═══════╡
                │ chr5  ┆ 28900001  ┆ 42500000  ┆ FRA5A │
                │ chr5  ┆ 92300001  ┆ 98200000  ┆ FRA5B │
                │ chr5  ┆ 130600001 ┆ 136200000 ┆ FRA5C │
                │ chr5  ┆ 92300001  ┆ 93916228  ┆ FRA5D │
                │ chr5  ┆ 18400001  ┆ 28900000  ┆ FRA5E │
                │ chr5  ┆ 98200001  ┆ 109600000 ┆ FRA5F │
                │ chr5  ┆ 168500001 ┆ 180915260 ┆ FRA5G │
                │ chr5  ┆ 50500001  ┆ 63000000  ┆ FRA5H │
                └───────┴───────────┴───────────┴───────┘
            ```


        !!! tip
            `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the BED file. As a rule of thumb for large scale operations (reading a whole BED), it is recommended to keep the default values.
        """

        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type=compression_type,
        )

        bed_read_options = BedReadOptions(
            thread_num=thread_num,
            object_storage_options=object_storage_options,
        )
        read_options = ReadOptions(bed_read_options=bed_read_options)
        py_register_table(ctx, path, name, InputFormat.Bed, read_options)

    @staticmethod
    def register_view(name: str, query: str) -> None:
        """
        Register a query as a Datafusion view. This view can be used in genomic ranges operations,
        such as overlap, nearest, and count_overlaps. It is useful for filtering, transforming, and aggregating data
        prior to the range operation. When combined with the range operation, it can be used to perform complex transformations in a streaming fashion end-to-end.

        Parameters:
            name: The name of the table.
            query: The SQL query.

        !!! Example
              ```python
              import polars_bio as pb
              pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz", "gnomad_sv")
              pb.register_view("v_gnomad_sv", "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv")
              pb.sql("SELECT * FROM v_gnomad_sv").limit(5).collect()
              ```
              ```shell
                shape: (5, 3)
                ┌───────┬─────────┬─────────┐
                │ chrom ┆ start   ┆ end     │
                │ ---   ┆ ---     ┆ ---     │
                │ str   ┆ u32     ┆ u32     │
                ╞═══════╪═════════╪═════════╡
                │ 21    ┆ 5031905 ┆ 5031905 │
                │ 21    ┆ 5031905 ┆ 5031905 │
                │ 21    ┆ 5031909 ┆ 5031909 │
                │ 21    ┆ 5031911 ┆ 5031911 │
                │ 21    ┆ 5031911 ┆ 5031911 │
                └───────┴─────────┴─────────┘
              ```
        """
        py_register_view(ctx, name, query)

    @staticmethod
    def register_bam(
        path: str,
        name: Union[str, None] = None,
        tag_fields: Union[list[str], None] = None,
        chunk_size: int = 64,
        concurrent_fetches: int = 8,
        allow_anonymous: bool = True,
        max_retries: int = 5,
        timeout: int = 300,
        enable_request_payer: bool = False,
    ) -> None:
        """
        Register a BAM file as a Datafusion table.

        Parameters:
            path: The path to the BAM file.
            name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
            tag_fields: List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
            chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
        !!! note
            BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.

        !!! Example

            ```python
            import polars_bio as pb
            pb.register_bam("gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam", "HG00096_bam", concurrent_fetches=1, chunk_size=8)
            pb.sql("SELECT chrom, flags FROM HG00096_bam").limit(5).collect()
            ```
            ```shell

                shape: (5, 2)
                ┌───────┬───────┐
                │ chrom ┆ flags │
                │ ---   ┆ ---   │
                │ str   ┆ u32   │
                ╞═══════╪═══════╡
                │ chr1  ┆ 163   │
                │ chr1  ┆ 163   │
                │ chr1  ┆ 99    │
                │ chr1  ┆ 99    │
                │ chr1  ┆ 99    │
                └───────┴───────┘
            ```
        !!! tip
            `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the BAM file. As a rule of thumb for large scale operations (reading a whole BAM), it is recommended to keep the default values.
            For interactively inspecting a schema, it is recommended to decrease `chunk_size` to **8-16** and `concurrent_fetches` to **1-2**.
        """

        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type="auto",
        )

        bam_read_options = BamReadOptions(
            object_storage_options=object_storage_options,
            tag_fields=tag_fields,
        )
        read_options = ReadOptions(bam_read_options=bam_read_options)
        py_register_table(ctx, path, name, InputFormat.Bam, read_options)

    @staticmethod
    def register_sam(
        path: str,
        name: Union[str, None] = None,
        tag_fields: Union[list[str], None] = None,
    ) -> None:
        """
        Register a SAM file as a Datafusion table.

        SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM.
        This function reuses the BAM table provider, which auto-detects
        the format from the file extension.

        Parameters:
            path: The path to the SAM file.
            name: The name of the table. If *None*, the name will be generated automatically from the path.
            tag_fields: List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]).
                If None, no optional tags are parsed (default).

        !!! Example
            ```python
            import polars_bio as pb
            pb.register_sam("test.sam", "my_sam")
            pb.sql("SELECT chrom, flags FROM my_sam").limit(5).collect()
            ```
        """
        bam_read_options = BamReadOptions(
            tag_fields=tag_fields,
        )
        read_options = ReadOptions(bam_read_options=bam_read_options)
        py_register_table(ctx, path, name, InputFormat.Sam, read_options)

    @staticmethod
    def register_cram(
        path: str,
        name: Union[str, None] = None,
        tag_fields: Union[list[str], None] = None,
        chunk_size: int = 64,
        concurrent_fetches: int = 8,
        allow_anonymous: bool = True,
        max_retries: int = 5,
        timeout: int = 300,
        enable_request_payer: bool = False,
    ) -> None:
        """
        Register a CRAM file as a Datafusion table.

        !!! warning "Embedded Reference Required"
            Currently, only CRAM files with **embedded reference sequences** are supported.
            CRAM files requiring external reference FASTA files cannot be registered.
            Most modern CRAM files include embedded references by default.

            To create a CRAM file with embedded reference using samtools:
            ```bash
            samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam
            ```

        Parameters:
            path: The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).
            name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
            tag_fields: List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
            chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
            concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
            allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
            enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
            max_retries:  The maximum number of retries for reading the file from object storage.
            timeout: The timeout in seconds for reading the file from object storage.
        !!! note
            CRAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.

        !!! tip
            `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the CRAM file. As a rule of thumb for large scale operations (reading a whole CRAM), it is recommended to keep the default values.
            For interactively inspecting a schema, it is recommended to decrease `chunk_size` to **8-16** and `concurrent_fetches` to **1-2**.
        """

        object_storage_options = PyObjectStorageOptions(
            allow_anonymous=allow_anonymous,
            enable_request_payer=enable_request_payer,
            chunk_size=chunk_size,
            concurrent_fetches=concurrent_fetches,
            max_retries=max_retries,
            timeout=timeout,
            compression_type="auto",
        )

        cram_read_options = CramReadOptions(
            reference_path=None,
            object_storage_options=object_storage_options,
            tag_fields=tag_fields,
        )
        read_options = ReadOptions(cram_read_options=cram_read_options)
        py_register_table(ctx, path, name, InputFormat.Cram, read_options)

    @staticmethod
    def sql(query: str) -> pl.LazyFrame:
        """
        Execute a SQL query on the registered tables.

        Parameters:
            query: The SQL query.

        !!! Example
              ```python
              import polars_bio as pb
              pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_v4_1_sv")
              pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
              ```
        """
        df = py_read_sql(ctx, query)
        return _lazy_scan(df)
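
A short end-to-end sketch of the SQL interface above, combining `register_vcf`, `register_view`, and `sql`; the local file path is hypothetical:

```python
import polars_bio as pb

# Hypothetical local copy of a gnomAD VCF
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_sv")

# Pre-filter and trim columns in a view, then query it lazily
pb.register_view(
    "v_gnomad_sv",
    "SELECT chrom, start, end FROM gnomad_sv WHERE chrom = 'chr21'",
)
pb.sql("SELECT count(*) AS cnt FROM v_gnomad_sv").collect()
```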

register_bam(path, name=None, tag_fields=None, chunk_size=64, concurrent_fetches=8, allow_anonymous=True, max_retries=5, timeout=300, enable_request_payer=False) staticmethod

Register a BAM file as a Datafusion table.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `path` | `str` | The path to the BAM file. | required |
| `name` | `Union[str, None]` | The name of the table. If None, the name of the table will be generated automatically based on the path. | `None` |
| `tag_fields` | `Union[list[str], None]` | List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode). | `None` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16. | `64` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2. | `8` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |

Note

BAM reader uses 1-based coordinate system for the start, end, mate_start, mate_end columns.

Example

```python
import polars_bio as pb
pb.register_bam("gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam", "HG00096_bam", concurrent_fetches=1, chunk_size=8)
pb.sql("SELECT chrom, flags FROM HG00096_bam").limit(5).collect()
```
```shell
shape: (5, 2)
┌───────┬───────┐
│ chrom ┆ flags │
│ ---   ┆ ---   │
│ str   ┆ u32   │
╞═══════╪═══════╡
│ chr1  ┆ 163   │
│ chr1  ┆ 163   │
│ chr1  ┆ 99    │
│ chr1  ┆ 99    │
│ chr1  ┆ 99    │
└───────┴───────┘
```

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the BAM file. As a rule of thumb for large scale operations (reading a whole BAM), it is recommended to keep the default values. For interactively inspecting a schema, it is recommended to decrease chunk_size to 8-16 and concurrent_fetches to 1-2.

Source code in polars_bio/sql.py
@staticmethod
def register_bam(
    path: str,
    name: Union[str, None] = None,
    tag_fields: Union[list[str], None] = None,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
    allow_anonymous: bool = True,
    max_retries: int = 5,
    timeout: int = 300,
    enable_request_payer: bool = False,
) -> None:
    """
    Register a BAM file as a Datafusion table.

    Parameters:
        path: The path to the BAM file.
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        tag_fields: List of BAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
    !!! note
        BAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.

    !!! Example

        ```python
        import polars_bio as pb
        pb.register_bam("gs://genomics-public-data/1000-genomes/bam/HG00096.mapped.ILLUMINA.bwa.GBR.low_coverage.20120522.bam", "HG00096_bam", concurrent_fetches=1, chunk_size=8)
        pb.sql("SELECT chrom, flags FROM HG00096_bam").limit(5).collect()
        ```
        ```shell

            shape: (5, 2)
            ┌───────┬───────┐
            │ chrom ┆ flags │
            │ ---   ┆ ---   │
            │ str   ┆ u32   │
            ╞═══════╪═══════╡
            │ chr1  ┆ 163   │
            │ chr1  ┆ 163   │
            │ chr1  ┆ 99    │
            │ chr1  ┆ 99    │
            │ chr1  ┆ 99    │
            └───────┴───────┘
        ```
    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the BAM file. As a rule of thumb for large scale operations (reading a whole BAM), it is recommended to keep the default values.
        For interactively inspecting a schema, it is recommended to decrease `chunk_size` to **8-16** and `concurrent_fetches` to **1-2**.
    """

    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type="auto",
    )

    bam_read_options = BamReadOptions(
        object_storage_options=object_storage_options,
        tag_fields=tag_fields,
    )
    read_options = ReadOptions(bam_read_options=bam_read_options)
    py_register_table(ctx, path, name, InputFormat.Bam, read_options)
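
A sketch of `tag_fields` in combination with SQL, assuming a hypothetical local BAM that carries NM and MD tags; the requested tags become ordinary columns that the query can filter on:

```python
import polars_bio as pb

pb.register_bam("input.bam", "my_bam", tag_fields=["NM", "MD"])
pb.sql("SELECT chrom, start, NM, MD FROM my_bam WHERE NM <= 2").limit(5).collect()
```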

register_bed(path, name=None, thread_num=1, chunk_size=64, concurrent_fetches=8, allow_anonymous=True, max_retries=5, timeout=300, enable_request_payer=False, compression_type='auto') staticmethod

Register a BED file as a Datafusion table.

Parameters:

| Name | Type | Description | Default |
|------|------|-------------|---------|
| `path` | `str` | The path to the BED file. | required |
| `name` | `Union[str, None]` | The name of the table. If None, the name of the table will be generated automatically based on the path. | `None` |
| `thread_num` | `int` | The number of threads to use for reading the BED file. Used only for parallel decompression of BGZF blocks. Works only for local files. | `1` |
| `chunk_size` | `int` | The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16. | `64` |
| `concurrent_fetches` | `int` | [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2. | `8` |
| `allow_anonymous` | `bool` | [GCS, AWS S3] Whether to allow anonymous access to object storage. | `True` |
| `enable_request_payer` | `bool` | [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer. | `False` |
| `compression_type` | `str` | The compression type of the BED file. If not specified, it will be detected automatically. | `'auto'` |
| `max_retries` | `int` | The maximum number of retries for reading the file from object storage. | `5` |
| `timeout` | `int` | The timeout in seconds for reading the file from object storage. | `300` |

Note

Only BED4 format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name. Also, unlike other text formats, GZIP compression is not supported.

Example

 cd /tmp
 wget https://webs.iiitd.edu.in/raghava/humcfs/fragile_site_bed.zip -O fragile_site_bed.zip
 unzip fragile_site_bed.zip -x "__MACOSX/*" "*/.DS_Store"
import polars_bio as pb
pb.register_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed", "test_bed")
pb.sql("select * FROM test_bed WHERE name LIKE 'FRA5%'").collect()
    shape: (8, 4)
    ┌───────┬───────────┬───────────┬───────┐
    │ chrom ┆ start     ┆ end       ┆ name  │
    │ ---   ┆ ---       ┆ ---       ┆ ---   │
    │ str   ┆ u32       ┆ u32       ┆ str   │
    ╞═══════╪═══════════╪═══════════╪═══════╡
    │ chr5  ┆ 28900001  ┆ 42500000  ┆ FRA5A │
    │ chr5  ┆ 92300001  ┆ 98200000  ┆ FRA5B │
    │ chr5  ┆ 130600001 ┆ 136200000 ┆ FRA5C │
    │ chr5  ┆ 92300001  ┆ 93916228  ┆ FRA5D │
    │ chr5  ┆ 18400001  ┆ 28900000  ┆ FRA5E │
    │ chr5  ┆ 98200001  ┆ 109600000 ┆ FRA5F │
    │ chr5  ┆ 168500001 ┆ 180915260 ┆ FRA5G │
    │ chr5  ┆ 50500001  ┆ 63000000  ┆ FRA5H │
    └───────┴───────────┴───────────┴───────┘

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the BED file. As a rule of thumb for large scale operations (reading a whole BED), it is recommended to keep the default values.
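For illustration only, a minimal sketch of registering a BED file straight from object storage with the smaller, interactive-friendly settings mentioned above (the bucket path is hypothetical):

```python
import polars_bio as pb

# Hypothetical object-storage path; smaller chunk_size/concurrent_fetches
# favour quick, interactive inspection over full scans.
pb.register_bed(
    "gs://my-bucket/annotations/fragile_sites.bed",  # hypothetical path
    "fragile_sites",
    chunk_size=8,
    concurrent_fetches=1,
)
pb.sql("SELECT * FROM fragile_sites").limit(5).collect()
```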

Source code in polars_bio/sql.py
@staticmethod
def register_bed(
    path: str,
    name: Union[str, None] = None,
    thread_num: int = 1,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
    allow_anonymous: bool = True,
    max_retries: int = 5,
    timeout: int = 300,
    enable_request_payer: bool = False,
    compression_type: str = "auto",
) -> None:
    """
    Register a BED file as a Datafusion table.

    Parameters:
        path: The path to the BED file.
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        thread_num: The number of threads to use for reading the BED file. Used **only** for parallel decompression of BGZF blocks. Works only for **local** files.
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        compression_type: The compression type of the BED file. If not specified, it will be detected automatically.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.

    !!! Note
        Only **BED4** format is supported. It extends the basic BED format (BED3) by adding a name field, resulting in four columns: chromosome, start position, end position, and name.
        Also, unlike other text formats, **GZIP** compression is not supported.

    !!! Example
        ```shell

         cd /tmp
         wget https://webs.iiitd.edu.in/raghava/humcfs/fragile_site_bed.zip -O fragile_site_bed.zip
         unzip fragile_site_bed.zip -x "__MACOSX/*" "*/.DS_Store"
        ```

        ```python
        import polars_bio as pb
        pb.register_bed("/tmp/fragile_site_bed/chr5_fragile_site.bed", "test_bed")
        pb.sql("select * FROM test_bed WHERE name LIKE 'FRA5%'").collect()
        ```

        ```shell

            shape: (8, 4)
            ┌───────┬───────────┬───────────┬───────┐
            │ chrom ┆ start     ┆ end       ┆ name  │
            │ ---   ┆ ---       ┆ ---       ┆ ---   │
            │ str   ┆ u32       ┆ u32       ┆ str   │
            ╞═══════╪═══════════╪═══════════╪═══════╡
            │ chr5  ┆ 28900001  ┆ 42500000  ┆ FRA5A │
            │ chr5  ┆ 92300001  ┆ 98200000  ┆ FRA5B │
            │ chr5  ┆ 130600001 ┆ 136200000 ┆ FRA5C │
            │ chr5  ┆ 92300001  ┆ 93916228  ┆ FRA5D │
            │ chr5  ┆ 18400001  ┆ 28900000  ┆ FRA5E │
            │ chr5  ┆ 98200001  ┆ 109600000 ┆ FRA5F │
            │ chr5  ┆ 168500001 ┆ 180915260 ┆ FRA5G │
            │ chr5  ┆ 50500001  ┆ 63000000  ┆ FRA5H │
            └───────┴───────────┴───────────┴───────┘
        ```


    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the BED file. As a rule of thumb for large scale operations (reading a whole BED), it is recommended to keep the default values.
    """

    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    bed_read_options = BedReadOptions(
        thread_num=thread_num,
        object_storage_options=object_storage_options,
    )
    read_options = ReadOptions(bed_read_options=bed_read_options)
    py_register_table(ctx, path, name, InputFormat.Bed, read_options)

register_cram(path, name=None, tag_fields=None, chunk_size=64, concurrent_fetches=8, allow_anonymous=True, max_retries=5, timeout=300, enable_request_payer=False) staticmethod

Register a CRAM file as a Datafusion table.

Embedded Reference Required

Currently, only CRAM files with embedded reference sequences are supported. CRAM files requiring external reference FASTA files cannot be registered. Most modern CRAM files include embedded references by default.

To create a CRAM file with embedded reference using samtools:

samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam

Parameters:

Name Type Description Default
path str

The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).

required
name Union[str, None]

The name of the table. If None, the name of the table will be generated automatically based on the path.

None
tag_fields Union[list[str], None]

List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).

None
chunk_size int

The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16.

64
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2.

8
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300

Note

CRAM reader uses 1-based coordinate system for the start, end, mate_start, mate_end columns.

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the CRAM file. As a rule of thumb for large scale operations (reading a whole CRAM), it is recommended to keep the default values. For interactive use, such as inspecting a schema, it is recommended to decrease chunk_size to 8-16 and concurrent_fetches to 1-2.
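Since no usage example is shown above, here is a minimal sketch, assuming a local CRAM file with an embedded reference and that the selected tags surface as additional columns (the path and tag selection are illustrative):

```python
import polars_bio as pb

# Hypothetical CRAM file with an embedded reference sequence.
pb.register_cram(
    "/tmp/HG00096.embedded_ref.cram",
    "hg00096_cram",
    tag_fields=["NM", "AS"],  # illustrative optional tags to expose as columns
)
pb.sql("SELECT chrom, start, end FROM hg00096_cram").limit(5).collect()
```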

Source code in polars_bio/sql.py
@staticmethod
def register_cram(
    path: str,
    name: Union[str, None] = None,
    tag_fields: Union[list[str], None] = None,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
    allow_anonymous: bool = True,
    max_retries: int = 5,
    timeout: int = 300,
    enable_request_payer: bool = False,
) -> None:
    """
    Register a CRAM file as a Datafusion table.

    !!! warning "Embedded Reference Required"
        Currently, only CRAM files with **embedded reference sequences** are supported.
        CRAM files requiring external reference FASTA files cannot be registered.
        Most modern CRAM files include embedded references by default.

        To create a CRAM file with embedded reference using samtools:
        ```bash
        samtools view -C -o output.cram --output-fmt-option embed_ref=1 input.bam
        ```

    Parameters:
        path: The path to the CRAM file (local or cloud storage: S3, GCS, Azure Blob).
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        tag_fields: List of CRAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default). Common tags include: NM (edit distance), MD (mismatch string), AS (alignment score), XS (secondary alignment score), RG (read group), CB (cell barcode), UB (UMI barcode).
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
    !!! note
        CRAM reader uses **1-based** coordinate system for the `start`, `end`, `mate_start`, `mate_end` columns.

    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the CRAM file. As a rule of thumb for large scale operations (reading a whole CRAM), it is recommended to keep the default values.
        For interactive use, such as inspecting a schema, it is recommended to decrease `chunk_size` to **8-16** and `concurrent_fetches` to **1-2**.
    """

    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type="auto",
    )

    cram_read_options = CramReadOptions(
        reference_path=None,
        object_storage_options=object_storage_options,
        tag_fields=tag_fields,
    )
    read_options = ReadOptions(cram_read_options=cram_read_options)
    py_register_table(ctx, path, name, InputFormat.Cram, read_options)

register_fastq(path, name=None, chunk_size=64, concurrent_fetches=8, allow_anonymous=True, max_retries=5, timeout=300, enable_request_payer=False, compression_type='auto', parallel=False) staticmethod

Register a FASTQ file as a Datafusion table.

Parameters:

Name Type Description Default
path str

The path to the FASTQ file.

required
name Union[str, None]

The name of the table. If None, the name of the table will be generated automatically based on the path.

None
chunk_size int

The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16.

64
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2.

8
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
compression_type str

The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').

'auto'
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300
parallel bool

Whether to use the parallel reader for BGZF compressed files. Default is False. If a file ends with ".gz" but is actually BGZF, it will attempt the parallel path and fall back to standard if not BGZF.

False

Example

  import polars_bio as pb
  pb.register_fastq("gs://genomics-public-data/platinum-genomes/fastq/ERR194146.fastq.gz", "test_fastq")
  pb.sql("SELECT name, description FROM test_fastq WHERE name LIKE 'ERR194146%'").limit(5).collect()
  shape: (5, 2)
┌─────────────────────┬─────────────────────────────────┐
│ name                ┆ description                     │
│ ---                 ┆ ---                             │
│ str                 ┆ str                             │
╞═════════════════════╪═════════════════════════════════╡
│ ERR194146.812444541 ┆ HSQ1008:141:D0CC8ACXX:2:1204:1… │
│ ERR194146.812444542 ┆ HSQ1008:141:D0CC8ACXX:4:1206:1… │
│ ERR194146.812444543 ┆ HSQ1008:141:D0CC8ACXX:3:2104:5… │
│ ERR194146.812444544 ┆ HSQ1008:141:D0CC8ACXX:3:2204:1… │
│ ERR194146.812444545 ┆ HSQ1008:141:D0CC8ACXX:3:1304:3… │
└─────────────────────┴─────────────────────────────────┘

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the FASTQ file. As a rule of thumb for large scale operations (reading a whole FASTQ), it is recommended to keep the default values.
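A minimal sketch of the `parallel` option with a BGZF-compressed file (the local path is hypothetical):

```python
import polars_bio as pb

# Hypothetical BGZF-compressed FASTQ; parallel=True enables the parallel BGZF
# reader and falls back to the standard reader if the file is not BGZF.
pb.register_fastq("/tmp/reads.fastq.bgz", "reads", parallel=True)
pb.sql("SELECT count(*) AS n_reads FROM reads").collect()
```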

Source code in polars_bio/sql.py
@staticmethod
def register_fastq(
    path: str,
    name: Union[str, None] = None,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
    allow_anonymous: bool = True,
    max_retries: int = 5,
    timeout: int = 300,
    enable_request_payer: bool = False,
    compression_type: str = "auto",
    parallel: bool = False,
) -> None:
    """
    Register a FASTQ file as a Datafusion table.

    Parameters:
        path: The path to the FASTQ file.
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        compression_type: The compression type of the FASTQ file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
        parallel: Whether to use the parallel reader for BGZF compressed files. Default is False. If a file ends with ".gz" but is actually BGZF, it will attempt the parallel path and fall back to standard if not BGZF.

    !!! Example
        ```python
          import polars_bio as pb
          pb.register_fastq("gs://genomics-public-data/platinum-genomes/fastq/ERR194146.fastq.gz", "test_fastq")
          pb.sql("SELECT name, description FROM test_fastq WHERE name LIKE 'ERR194146%'").limit(5).collect()
        ```

        ```shell

          shape: (5, 2)
        ┌─────────────────────┬─────────────────────────────────┐
        │ name                ┆ description                     │
        │ ---                 ┆ ---                             │
        │ str                 ┆ str                             │
        ╞═════════════════════╪═════════════════════════════════╡
        │ ERR194146.812444541 ┆ HSQ1008:141:D0CC8ACXX:2:1204:1… │
        │ ERR194146.812444542 ┆ HSQ1008:141:D0CC8ACXX:4:1206:1… │
        │ ERR194146.812444543 ┆ HSQ1008:141:D0CC8ACXX:3:2104:5… │
        │ ERR194146.812444544 ┆ HSQ1008:141:D0CC8ACXX:3:2204:1… │
        │ ERR194146.812444545 ┆ HSQ1008:141:D0CC8ACXX:3:1304:3… │
        └─────────────────────┴─────────────────────────────────┘

        ```


    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the FASTQ file. As a rule of thumb for large scale operations (reading a whole FASTQ), it is recommended to keep the default values.
    """

    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    fastq_read_options = FastqReadOptions(
        object_storage_options=object_storage_options, parallel=parallel
    )
    read_options = ReadOptions(fastq_read_options=fastq_read_options)
    py_register_table(ctx, path, name, InputFormat.Fastq, read_options)

register_gff(path, name=None, chunk_size=64, concurrent_fetches=8, allow_anonymous=True, max_retries=5, timeout=300, enable_request_payer=False, compression_type='auto') staticmethod

Register a GFF file as a Datafusion table.

Parameters:

Name Type Description Default
path str

The path to the GFF file.

required
name Union[str, None]

The name of the table. If None, the name of the table will be generated automatically based on the path.

None
chunk_size int

The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16.

64
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2.

8
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
compression_type str

The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').

'auto'
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300

Note

GFF reader uses 1-based coordinate system for the start and end columns.

Example

wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz -O /tmp/gencode.v38.annotation.gff3.gz
import polars_bio as pb
pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz")
pb.sql("SELECT attributes, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY attributes").limit(5).collect()
shape: (5, 2)
┌───────────────────┬───────┐
│ Parent            ┆ cnt   │
│ ---               ┆ ---   │
│ str               ┆ i64   │
╞═══════════════════╪═══════╡
│ null              ┆ 60649 │
│ ENSG00000223972.5 ┆ 2     │
│ ENST00000456328.2 ┆ 3     │
│ ENST00000450305.2 ┆ 6     │
│ ENSG00000227232.5 ┆ 1     │
└───────────────────┴───────┘

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the GFF file. As a rule of thumb for large scale operations (reading a whole GFF), it is recommended to keep the default values.

Source code in polars_bio/sql.py
@staticmethod
def register_gff(
    path: str,
    name: Union[str, None] = None,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
    allow_anonymous: bool = True,
    max_retries: int = 5,
    timeout: int = 300,
    enable_request_payer: bool = False,
    compression_type: str = "auto",
) -> None:
    """
    Register a GFF file as a Datafusion table.

    Parameters:
        path: The path to the GFF file.
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        compression_type: The compression type of the GFF file. If not specified, it will be detected automatically based on the file extension. BGZF and GZIP compression is supported ('bgz' and 'gz').
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
    !!! note
        GFF reader uses **1-based** coordinate system for the `start` and `end` columns.

    !!! Example
        ```shell
        wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gff3.gz -O /tmp/gencode.v38.annotation.gff3.gz
        ```
        ```python
        import polars_bio as pb
        pb.register_gff("/tmp/gencode.v38.annotation.gff3.gz", "gencode_v38_annotation3_bgz")
        pb.sql("SELECT attributes, count(*) AS cnt FROM gencode_v38_annotation3_bgz GROUP BY attributes").limit(5).collect()
        ```
        ```shell

        shape: (5, 2)
        ┌───────────────────┬───────┐
        │ Parent            ┆ cnt   │
        │ ---               ┆ ---   │
        │ str               ┆ i64   │
        ╞═══════════════════╪═══════╡
        │ null              ┆ 60649 │
        │ ENSG00000223972.5 ┆ 2     │
        │ ENST00000456328.2 ┆ 3     │
        │ ENST00000450305.2 ┆ 6     │
        │ ENSG00000227232.5 ┆ 1     │
        └───────────────────┴───────┘

        ```
    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the GFF file. As a rule of thumb for large scale operations (reading a whole GFF), it is recommended to keep the default values.
    """

    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    gff_read_options = GffReadOptions(
        attr_fields=None,
        object_storage_options=object_storage_options,
    )
    read_options = ReadOptions(gff_read_options=gff_read_options)
    py_register_table(ctx, path, name, InputFormat.Gff, read_options)

register_sam(path, name=None, tag_fields=None) staticmethod

Register a SAM file as a Datafusion table.

SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM. This function reuses the BAM table provider, which auto-detects the format from the file extension.

Parameters:

Name Type Description Default
path str

The path to the SAM file.

required
name Union[str, None]

The name of the table. If None, the name will be generated automatically from the path.

None
tag_fields Union[list[str], None]

List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]). If None, no optional tags are parsed (default).

None

Example

import polars_bio as pb
pb.register_sam("test.sam", "my_sam")
pb.sql("SELECT chrom, flags FROM my_sam").limit(5).collect()
Source code in polars_bio/sql.py
@staticmethod
def register_sam(
    path: str,
    name: Union[str, None] = None,
    tag_fields: Union[list[str], None] = None,
) -> None:
    """
    Register a SAM file as a Datafusion table.

    SAM (Sequence Alignment/Map) is the plain-text counterpart of BAM.
    This function reuses the BAM table provider, which auto-detects
    the format from the file extension.

    Parameters:
        path: The path to the SAM file.
        name: The name of the table. If *None*, the name will be generated automatically from the path.
        tag_fields: List of SAM tag names to include as columns (e.g., ["NM", "MD", "AS"]).
            If None, no optional tags are parsed (default).

    !!! Example
        ```python
        import polars_bio as pb
        pb.register_sam("test.sam", "my_sam")
        pb.sql("SELECT chrom, flags FROM my_sam").limit(5).collect()
        ```
    """
    bam_read_options = BamReadOptions(
        tag_fields=tag_fields,
    )
    read_options = ReadOptions(bam_read_options=bam_read_options)
    py_register_table(ctx, path, name, InputFormat.Sam, read_options)

register_vcf(path, name=None, info_fields=None, chunk_size=64, concurrent_fetches=8, allow_anonymous=True, max_retries=5, timeout=300, enable_request_payer=False, compression_type='auto') staticmethod

Register a VCF file as a Datafusion table.

Parameters:

Name Type Description Default
path str

The path to the VCF file.

required
name Union[str, None]

The name of the table. If None, the name of the table will be generated automatically based on the path.

None
info_fields Union[list[str], None]

List of INFO field names to register. If None, all INFO fields will be detected automatically from the VCF header. Use this to limit registration to specific fields for better performance.

None
chunk_size int

The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 8-16.

64
concurrent_fetches int

[GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to 1-2.

8
allow_anonymous bool

[GCS, AWS S3] Whether to allow anonymous access to object storage.

True
enable_request_payer bool

[AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.

False
compression_type str

The compression type of the VCF file. If not specified, it will be detected automatically.

'auto'
max_retries int

The maximum number of retries for reading the file from object storage.

5
timeout int

The timeout in seconds for reading the file from object storage.

300

Note

VCF reader uses 1-based coordinate system for the start and end columns.

Example

import polars_bio as pb
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz")
INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz

Tip

chunk_size and concurrent_fetches can be adjusted according to the network bandwidth and the size of the VCF file. As a rule of thumb for large scale operations (reading a whole VCF), it is recommended to keep the default values.
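A sketch of `info_fields` used to restrict registration to a few INFO fields for faster scans, assuming those fields exist in the VCF header and are exposed as columns (the field names are illustrative):

```python
import polars_bio as pb

# Register only selected INFO fields instead of auto-detecting all of them.
pb.register_vcf(
    "/tmp/gnomad.v4.1.sv.sites.vcf.gz",
    "gnomad_v4_1_sv",
    info_fields=["SVTYPE", "SVLEN"],  # illustrative INFO field names
)
pb.sql("SELECT * FROM gnomad_v4_1_sv").limit(5).collect()
```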

Source code in polars_bio/sql.py
@staticmethod
def register_vcf(
    path: str,
    name: Union[str, None] = None,
    info_fields: Union[list[str], None] = None,
    chunk_size: int = 64,
    concurrent_fetches: int = 8,
    allow_anonymous: bool = True,
    max_retries: int = 5,
    timeout: int = 300,
    enable_request_payer: bool = False,
    compression_type: str = "auto",
) -> None:
    """
    Register a VCF file as a Datafusion table.

    Parameters:
        path: The path to the VCF file.
        name: The name of the table. If *None*, the name of the table will be generated automatically based on the path.
        info_fields: List of INFO field names to register. If *None*, all INFO fields will be detected automatically from the VCF header. Use this to limit registration to specific fields for better performance.
        chunk_size: The size in MB of a chunk when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **8-16**.
        concurrent_fetches: [GCS] The number of concurrent fetches when reading from an object store. Default settings are optimized for large scale operations. For small scale (interactive) operations, it is recommended to decrease this value to **1-2**.
        allow_anonymous: [GCS, AWS S3] Whether to allow anonymous access to object storage.
        enable_request_payer: [AWS S3] Whether to enable request payer for object storage. This is useful for reading files from AWS S3 buckets that require request payer.
        compression_type: The compression type of the VCF file. If not specified, it will be detected automatically.
        max_retries:  The maximum number of retries for reading the file from object storage.
        timeout: The timeout in seconds for reading the file from object storage.
    !!! note
        VCF reader uses **1-based** coordinate system for the `start` and `end` columns.

    !!! Example
          ```python
          import polars_bio as pb
          pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz")
          ```
         ```shell
         INFO:polars_bio:Table: gnomad_v4_1_sv_sites_gz registered for path: /tmp/gnomad.v4.1.sv.sites.vcf.gz
         ```
    !!! tip
        `chunk_size` and `concurrent_fetches` can be adjusted according to the network bandwidth and the size of the VCF file. As a rule of thumb for large scale operations (reading a whole VCF), it is recommended to keep the default values.
    """

    object_storage_options = PyObjectStorageOptions(
        allow_anonymous=allow_anonymous,
        enable_request_payer=enable_request_payer,
        chunk_size=chunk_size,
        concurrent_fetches=concurrent_fetches,
        max_retries=max_retries,
        timeout=timeout,
        compression_type=compression_type,
    )

    # Use provided info_fields or autodetect from VCF header
    if info_fields is not None:
        all_info_fields = info_fields
    else:
        # Get all info fields from VCF header for automatic field detection
        all_info_fields = None
        try:
            from .io import IOOperations

            vcf_schema_df = IOOperations.describe_vcf(
                path,
                allow_anonymous=allow_anonymous,
                enable_request_payer=enable_request_payer,
                compression_type=compression_type,
            )
            all_info_fields = vcf_schema_df.select("name").to_series().to_list()
        except Exception:
            # Fallback to empty list if unable to get info fields
            all_info_fields = []

    vcf_read_options = VcfReadOptions(
        info_fields=all_info_fields,
        object_storage_options=object_storage_options,
    )
    read_options = ReadOptions(vcf_read_options=vcf_read_options)
    py_register_table(ctx, path, name, InputFormat.Vcf, read_options)

register_view(name, query) staticmethod

Register a query as a Datafusion view. This view can be used in genomic ranges operations, such as overlap, nearest, and count_overlaps. It is useful for filtering, transforming, and aggregating data prior to the range operation. When combined with a range operation, it can be used to perform complex processing in a streaming fashion, end-to-end.

Parameters:

Name Type Description Default
name str

The name of the table.

required
query str

The SQL query.

required

Example

import polars_bio as pb
pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz", "gnomad_sv")
pb.register_view("v_gnomad_sv", "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv")
pb.sql("SELECT * FROM v_gnomad_sv").limit(5).collect()
  shape: (5, 3)
  ┌───────┬─────────┬─────────┐
  │ chrom ┆ start   ┆ end     │
  │ ---   ┆ ---     ┆ ---     │
  │ str   ┆ u32     ┆ u32     │
  ╞═══════╪═════════╪═════════╡
  │ 21    ┆ 5031905 ┆ 5031905 │
  │ 21    ┆ 5031905 ┆ 5031905 │
  │ 21    ┆ 5031909 ┆ 5031909 │
  │ 21    ┆ 5031911 ┆ 5031911 │
  │ 21    ┆ 5031911 ┆ 5031911 │
  └───────┴─────────┴─────────┘
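To illustrate the end-to-end streaming use mentioned above, a sketch that feeds the view into an interval operation, assuming a second table of target regions (here called `targets`) has been registered separately and that both inputs carry coordinate-system metadata:

```python
import polars_bio as pb

pb.register_vcf(
    "gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz",
    "gnomad_sv",
)
# Pre-filter/transform with a view, then use it in a range operation.
pb.register_view(
    "v_gnomad_sv",
    "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv",
)
pb.overlap("v_gnomad_sv", "targets").limit(5).collect()  # "targets" is hypothetical
```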

Source code in polars_bio/sql.py
@staticmethod
def register_view(name: str, query: str) -> None:
    """
    Register a query as a Datafusion view. This view can be used in genomic ranges operations,
    such as overlap, nearest, and count_overlaps. It is useful for filtering, transforming, and aggregating data
    prior to the range operation. When combined with a range operation, it can be used to perform complex processing in a streaming fashion, end-to-end.

    Parameters:
        name: The name of the table.
        query: The SQL query.

    !!! Example
          ```python
          import polars_bio as pb
          pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz", "gnomad_sv")
          pb.register_view("v_gnomad_sv", "SELECT replace(chrom,'chr', '') AS chrom, start, end FROM gnomad_sv")
          pb.sql("SELECT * FROM v_gnomad_sv").limit(5).collect()
          ```
          ```shell
            shape: (5, 3)
            ┌───────┬─────────┬─────────┐
            │ chrom ┆ start   ┆ end     │
            │ ---   ┆ ---     ┆ ---     │
            │ str   ┆ u32     ┆ u32     │
            ╞═══════╪═════════╪═════════╡
            │ 21    ┆ 5031905 ┆ 5031905 │
            │ 21    ┆ 5031905 ┆ 5031905 │
            │ 21    ┆ 5031909 ┆ 5031909 │
            │ 21    ┆ 5031911 ┆ 5031911 │
            │ 21    ┆ 5031911 ┆ 5031911 │
            └───────┴─────────┴─────────┘
          ```
    """
    py_register_view(ctx, name, query)

sql(query) staticmethod

Execute a SQL query on the registered tables.

Parameters:

Name Type Description Default
query str

The SQL query.

required

Example

import polars_bio as pb
pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_v4_1_sv")
pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
Source code in polars_bio/sql.py
@staticmethod
def sql(query: str) -> pl.LazyFrame:
    """
    Execute a SQL query on the registered tables.

    Parameters:
        query: The SQL query.

    !!! Example
          ```python
          import polars_bio as pb
          pb.register_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", "gnomad_v4_1_sv")
          pb.sql("SELECT * FROM gnomad_v4_1_sv LIMIT 5").collect()
          ```
    """
    df = py_read_sql(ctx, query)
    return _lazy_scan(df)

range_operations

Source code in polars_bio/range_op.py
class IntervalOperations:

    @staticmethod
    def overlap(
        df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        suffixes: tuple[str, str] = ("_1", "_2"),
        on_cols: Union[list[str], None] = None,
        cols1: Union[list[str], None] = ["chrom", "start", "end"],
        cols2: Union[list[str], None] = ["chrom", "start", "end"],
        algorithm: str = "Coitrees",
        low_memory: bool = False,
        output_type: str = "polars.LazyFrame",
        read_options1: Union[ReadOptions, None] = None,
        read_options2: Union[ReadOptions, None] = None,
        projection_pushdown: bool = True,
    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
        """
        Find pairs of overlapping genomic intervals.
        Bioframe inspired API.

        The coordinate system (0-based or 1-based) is automatically detected from
        DataFrame metadata set at I/O time. Both inputs must have the same coordinate
        system.

        Parameters:
            df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
            df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED  and Parquet are supported.
            cols1: The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            cols2:  The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            suffixes: Suffixes for the columns of the two overlapped sets.
            on_cols: List of additional column names to join on. default is None.
            algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals
            low_memory: If True, use low memory method for output generation. This may be slower but uses less memory.
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame", and "datafusion.DataFrame" are also supported.
            read_options1: Additional options for reading the input files.
            read_options2: Additional options for reading the input files.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

        Returns:
            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

        Raises:
            MissingCoordinateSystemError: If either input lacks coordinate system metadata
                and `datafusion.bio.coordinate_system_check` is "true" (default). Use polars-bio
                I/O functions (scan_*, read_*) which automatically set metadata, or set it manually
                on Polars DataFrames via `df.config_meta.set(coordinate_system_zero_based=True/False)`
                or on Pandas DataFrames via `df.attrs["coordinate_system_zero_based"] = True/False`.
                Set `pb.set_option("datafusion.bio.coordinate_system_check", False)` to disable
                strict checking and fall back to global coordinate system setting.
            CoordinateSystemMismatchError: If inputs have different coordinate systems.

        Note:
            1. The default output format, i.e.  [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
            This enables efficient processing of large datasets without loading the entire output dataset into memory.
            2. Streaming is only supported for polars.LazyFrame output.

        Example:
            ```python
            import polars_bio as pb
            import pandas as pd

            df1 = pd.DataFrame([
                ['chr1', 1, 5],
                ['chr1', 3, 8],
                ['chr1', 8, 10],
                ['chr1', 12, 14]],
            columns=['chrom', 'start', 'end']
            )
            df1.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

            df2 = pd.DataFrame(
            [['chr1', 4, 8],
             ['chr1', 10, 11]],
            columns=['chrom', 'start', 'end' ]
            )
            df2.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

            overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

            overlapping_intervals
                chrom_1         start_1     end_1 chrom_2       start_2  end_2
            0     chr1            1          5     chr1            4          8
            1     chr1            3          8     chr1            4          8

            ```

        Todo:
             Support for on_cols.
        """

        _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

        # Get filter_op from DataFrame metadata
        filter_op = _get_filter_op_from_metadata(df1, df2)

        cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
        cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
        range_options = RangeOptions(
            range_op=RangeOp.Overlap,
            filter_op=filter_op,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
            overlap_alg=algorithm,
            overlap_low_memory=low_memory,
        )

        return range_operation(
            df1,
            df2,
            range_options,
            output_type,
            ctx,
            read_options1,
            read_options2,
            projection_pushdown,
        )

    @staticmethod
    def nearest(
        df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        suffixes: tuple[str, str] = ("_1", "_2"),
        on_cols: Union[list[str], None] = None,
        cols1: Union[list[str], None] = ["chrom", "start", "end"],
        cols2: Union[list[str], None] = ["chrom", "start", "end"],
        output_type: str = "polars.LazyFrame",
        read_options: Union[ReadOptions, None] = None,
        projection_pushdown: bool = True,
    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
        """
        Find pairs of closest genomic intervals.
        Bioframe inspired API.

        The coordinate system (0-based or 1-based) is automatically detected from
        DataFrame metadata set at I/O time. Both inputs must have the same coordinate
        system.

        Parameters:
            df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
            df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED  and Parquet are supported.
            cols1: The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            cols2:  The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            suffixes: Suffixes for the columns of the two overlapped sets.
            on_cols: List of additional column names to join on. default is None.
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame", and "datafusion.DataFrame" are also supported.
            read_options: Additional options for reading the input files.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

        Returns:
            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

        Raises:
            MissingCoordinateSystemError: If either input lacks coordinate system metadata
                and `datafusion.bio.coordinate_system_check` is "true" (default).
            CoordinateSystemMismatchError: If inputs have different coordinate systems.

        Note:
            The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
            This enables efficient processing of large datasets without loading the entire output dataset into memory.

        Example:

        Todo:
            Support for on_cols.
        """

        _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

        # Get filter_op from DataFrame metadata
        filter_op = _get_filter_op_from_metadata(df1, df2)

        cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
        cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
        range_options = RangeOptions(
            range_op=RangeOp.Nearest,
            filter_op=filter_op,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
        )
        return range_operation(
            df1,
            df2,
            range_options,
            output_type,
            ctx,
            read_options,
            projection_pushdown=projection_pushdown,
        )

    @staticmethod
    def coverage(
        df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        suffixes: tuple[str, str] = ("_1", "_2"),
        on_cols: Union[list[str], None] = None,
        cols1: Union[list[str], None] = ["chrom", "start", "end"],
        cols2: Union[list[str], None] = ["chrom", "start", "end"],
        output_type: str = "polars.LazyFrame",
        read_options: Union[ReadOptions, None] = None,
        projection_pushdown: bool = True,
    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
        """
        Calculate intervals coverage.
        Bioframe inspired API.

        The coordinate system (0-based or 1-based) is automatically detected from
        DataFrame metadata set at I/O time. Both inputs must have the same coordinate
        system.

        Parameters:
            df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
            df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED  and Parquet are supported.
            cols1: The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            cols2:  The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            suffixes: Suffixes for the columns of the two overlapped sets.
            on_cols: List of additional column names to join on. default is None.
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame", and "datafusion.DataFrame" are also supported.
            read_options: Additional options for reading the input files.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

        Returns:
            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

        Raises:
            MissingCoordinateSystemError: If either input lacks coordinate system metadata
                and `datafusion.bio.coordinate_system_check` is "true" (default).
            CoordinateSystemMismatchError: If inputs have different coordinate systems.

        Note:
            The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
            This enables efficient processing of large datasets without loading the entire output dataset into memory.

        Example:

        Todo:
            Support for on_cols.
        """

        _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

        # Get filter_op from DataFrame metadata
        filter_op = _get_filter_op_from_metadata(df1, df2)

        cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
        cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
        range_options = RangeOptions(
            range_op=RangeOp.Coverage,
            filter_op=filter_op,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
        )
        return range_operation(
            df2,
            df1,
            range_options,
            output_type,
            ctx,
            read_options,
            projection_pushdown=projection_pushdown,
        )

    @staticmethod
    def count_overlaps(
        df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        suffixes: tuple[str, str] = ("", "_"),
        cols1: Union[list[str], None] = ["chrom", "start", "end"],
        cols2: Union[list[str], None] = ["chrom", "start", "end"],
        on_cols: Union[list[str], None] = None,
        output_type: str = "polars.LazyFrame",
        naive_query: bool = True,
        projection_pushdown: bool = True,
    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
        """
        Count pairs of overlapping genomic intervals.
        Bioframe inspired API.

        The coordinate system (0-based or 1-based) is automatically detected from
        DataFrame metadata set at I/O time. Both inputs must have the same coordinate
        system.

        Parameters:
            df1: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
            df2: Can be a path to a file, a polars DataFrame, or a pandas DataFrame or a registered table. CSV with a header, BED  and Parquet are supported.
            suffixes: Suffixes for the columns of the two overlapped sets.
            cols1: The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            cols2:  The names of columns containing the chromosome, start and end of the
                genomic intervals, provided separately for each set.
            on_cols: List of additional column names to join on. default is None.
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame", and "datafusion.DataFrame" are also supported.
            naive_query: If True, count overlaps with a naive query based on the overlap operation.
            projection_pushdown: Enable column projection pushdown to optimize query performance by only reading the necessary columns at the DataFusion level.

        Returns:
            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

        Raises:
            MissingCoordinateSystemError: If either input lacks coordinate system metadata
                and `datafusion.bio.coordinate_system_check` is "true" (default).
            CoordinateSystemMismatchError: If inputs have different coordinate systems.

        Example:
            ```python
            import polars_bio as pb
            import pandas as pd

            df1 = pd.DataFrame([
                ['chr1', 1, 5],
                ['chr1', 3, 8],
                ['chr1', 8, 10],
                ['chr1', 12, 14]],
            columns=['chrom', 'start', 'end']
            )
            df1.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

            df2 = pd.DataFrame(
            [['chr1', 4, 8],
             ['chr1', 10, 11]],
            columns=['chrom', 'start', 'end' ]
            )
            df2.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

            counts = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")

            counts

            chrom  start  end  count
            0  chr1      1    5      1
            1  chr1      3    8      1
            2  chr1      8   10      0
            3  chr1     12   14      0
            ```

        Todo:
             Support return_input.
        """
        _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

        # Get filter_op and zero_based from DataFrame metadata
        zero_based = validate_coordinate_systems(df1, df2, ctx)
        filter_op = FilterOp.Strict if zero_based else FilterOp.Weak

        my_ctx = get_py_ctx()
        on_cols = [] if on_cols is None else on_cols
        cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
        cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
        if naive_query:
            range_options = RangeOptions(
                range_op=RangeOp.CountOverlapsNaive,
                filter_op=filter_op,
                suffixes=suffixes,
                columns_1=cols1,
                columns_2=cols2,
            )
            return range_operation(df2, df1, range_options, output_type, ctx)
        df1 = read_df_to_datafusion(my_ctx, df1)
        df2 = read_df_to_datafusion(my_ctx, df2)

        curr_cols = set(df1.schema().names) | set(df2.schema().names)
        s1start_s2end = prevent_column_collision("s1starts2end", curr_cols)
        s1end_s2start = prevent_column_collision("s1ends2start", curr_cols)
        contig = prevent_column_collision("contig", curr_cols)
        count = prevent_column_collision("count", curr_cols)
        starts = prevent_column_collision("starts", curr_cols)
        ends = prevent_column_collision("ends", curr_cols)
        is_s1 = prevent_column_collision("is_s1", curr_cols)
        suff, _ = suffixes
        df1, df2 = df2, df1
        df1 = df1.select(
            *(
                [
                    literal(1).alias(is_s1),
                    col(cols1[1]).alias(s1start_s2end),
                    col(cols1[2]).alias(s1end_s2start),
                    col(cols1[0]).alias(contig),
                ]
                + on_cols
            )
        )
        df2 = df2.select(
            *(
                [
                    literal(0).alias(is_s1),
                    col(cols2[2]).alias(s1end_s2start),
                    col(cols2[1]).alias(s1start_s2end),
                    col(cols2[0]).alias(contig),
                ]
                + on_cols
            )
        )

        df = df1.union(df2)

        partitioning = [col(contig)] + [col(c) for c in on_cols]
        df = df.select(
            *(
                [
                    s1start_s2end,
                    s1end_s2start,
                    contig,
                    is_s1,
                    datafusion.functions.sum(col(is_s1))
                    .over(
                        datafusion.expr.Window(
                            partition_by=partitioning,
                            order_by=[
                                col(s1start_s2end).sort(),
                                col(is_s1).sort(ascending=zero_based),
                            ],
                        )
                    )
                    .alias(starts),
                    datafusion.functions.sum(col(is_s1))
                    .over(
                        datafusion.expr.Window(
                            partition_by=partitioning,
                            order_by=[
                                col(s1end_s2start).sort(),
                                col(is_s1).sort(ascending=(not zero_based)),
                            ],
                        )
                    )
                    .alias(ends),
                ]
                + on_cols
            )
        )
        df = df.filter(col(is_s1) == 0)
        df = df.select(
            *(
                [
                    col(contig).alias(cols1[0] + suff),
                    col(s1end_s2start).alias(cols1[1] + suff),
                    col(s1start_s2end).alias(cols1[2] + suff),
                ]
                + on_cols
                + [(col(starts) - col(ends)).alias(count)]
            )
        )

        return convert_result(df, output_type)

    @staticmethod
    def merge(
        df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
        min_dist: float = 0,
        cols: Union[list[str], None] = ["chrom", "start", "end"],
        on_cols: Union[list[str], None] = None,
        output_type: str = "polars.LazyFrame",
        projection_pushdown: bool = True,
    ) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
        """
        Merge overlapping intervals. It is assumed that start < end.

        The coordinate system (0-based or 1-based) is automatically detected from
        DataFrame metadata set at I/O time.

        Parameters:
            df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
            min_dist: Minimum distance between intervals to merge. Default is 0.
            cols: The names of columns containing the chromosome, start and end of the
                genomic intervals.
            on_cols: List of additional column names for clustering. Default is None.
            output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
            projection_pushdown: Enable column projection pushdown so that only the necessary columns are read at the DataFusion level.

        Returns:
            **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the merged intervals.

        Raises:
            MissingCoordinateSystemError: If input lacks coordinate system metadata
                and `datafusion.bio.coordinate_system_check` is "true" (default).

        Example:

        Todo:
            Support for on_cols.
        """
        suffixes = ("_1", "_2")
        _validate_overlap_input(cols, cols, on_cols, suffixes, output_type)

        # Get zero_based from DataFrame metadata
        zero_based = validate_coordinate_system_single(df, ctx)

        my_ctx = get_py_ctx()
        cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols
        contig = cols[0]
        start = cols[1]
        end = cols[2]

        on_cols = [] if on_cols is None else on_cols
        on_cols = [contig] + on_cols

        df = read_df_to_datafusion(my_ctx, df)
        df_schema = df.schema()
        start_type = df_schema.field(start).type
        end_type = df_schema.field(end).type

        curr_cols = set(df_schema.names)
        start_end = prevent_column_collision("start_end", curr_cols)
        is_start_end = prevent_column_collision("is_start_or_end", curr_cols)
        current_intervals = prevent_column_collision("current_intervals", curr_cols)
        n_intervals = prevent_column_collision("n_intervals", curr_cols)

        end_positions = df.select(
            *(
                [
                    (col(end) + min_dist).alias(start_end),
                    literal(-1).alias(is_start_end),
                ]
                + on_cols
            )
        )
        start_positions = df.select(
            *([col(start).alias(start_end), literal(1).alias(is_start_end)] + on_cols)
        )
        all_positions = start_positions.union(end_positions)
        start_end_type = all_positions.schema().field(start_end).type
        all_positions = all_positions.select(
            *([col(start_end).cast(start_end_type), col(is_start_end)] + on_cols)
        )

        sorting = [
            col(start_end).sort(),
            col(is_start_end).sort(ascending=zero_based),
        ]
        all_positions = all_positions.sort(*sorting)

        on_cols_expr = [col(c) for c in on_cols]

        win = datafusion.expr.Window(
            partition_by=on_cols_expr,
            order_by=sorting,
        )
        all_positions = all_positions.select(
            *(
                [
                    start_end,
                    is_start_end,
                    datafusion.functions.sum(col(is_start_end))
                    .over(win)
                    .alias(current_intervals),
                ]
                + on_cols
                + [
                    datafusion.functions.row_number(
                        partition_by=on_cols_expr, order_by=sorting
                    ).alias(n_intervals)
                ]
            )
        )
        all_positions = all_positions.filter(
            ((col(current_intervals) == 0) & (col(is_start_end) == -1))
            | ((col(current_intervals) == 1) & (col(is_start_end) == 1))
        )
        all_positions = all_positions.select(
            *(
                [start_end, is_start_end]
                + on_cols
                + [
                    (
                        (
                            col(n_intervals)
                            - datafusion.functions.lag(
                                col(n_intervals), partition_by=on_cols_expr
                            )
                            + 1
                        )
                        / 2
                    )
                    .cast(pa.int64())
                    .alias(n_intervals)
                ]
            )
        )
        result = all_positions.select(
            *(
                [
                    (col(start_end) - min_dist).alias(end),
                    is_start_end,
                    datafusion.functions.lag(
                        col(start_end), partition_by=on_cols_expr
                    ).alias(start),
                ]
                + on_cols
                + [n_intervals]
            )
        )
        result = result.filter(col(is_start_end) == -1)
        result = result.select(
            *(
                [contig, col(start).cast(start_type), col(end).cast(end_type)]
                + on_cols[1:]
                + [n_intervals]
            )
        )

        output = convert_result(result, output_type)

        # Propagate coordinate system metadata to result
        if output_type in ("polars.DataFrame", "polars.LazyFrame", "pandas.DataFrame"):
            set_coordinate_system(output, zero_based)

        return output

count_overlaps(df1, df2, suffixes=('', '_'), cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], on_cols=None, output_type='polars.LazyFrame', naive_query=True, projection_pushdown=True) staticmethod

Count pairs of overlapping genomic intervals. Bioframe inspired API.

The coordinate system (0-based or 1-based) is automatically detected from DataFrame metadata set at I/O time. Both inputs must have the same coordinate system.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df1 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported. | required |
| df2 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported. | required |
| suffixes | tuple[str, str] | Suffixes for the columns of the two overlapped sets. | ('', '_') |
| cols1 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| cols2 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| on_cols | Union[list[str], None] | List of additional column names to join on. Default is None. | None |
| output_type | str | Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported. | 'polars.LazyFrame' |
| naive_query | bool | If True, use a naive query (based on the overlap operation) for counting overlaps. | True |
| projection_pushdown | bool | Enable column projection pushdown so that only the necessary columns are read at the DataFusion level. | True |

Returns:

| Type | Description |
| --- | --- |
| Union[LazyFrame, DataFrame, 'pd.DataFrame', DataFrame] | polars.LazyFrame or polars.DataFrame or pandas.DataFrame with the overlap counts for each interval in df1. |

Raises:

| Type | Description |
| --- | --- |
| MissingCoordinateSystemError | If either input lacks coordinate system metadata and datafusion.bio.coordinate_system_check is "true" (default). |
| CoordinateSystemMismatchError | If inputs have different coordinate systems. |

Example

```python
import polars_bio as pb
import pandas as pd

df1 = pd.DataFrame([
    ['chr1', 1, 5],
    ['chr1', 3, 8],
    ['chr1', 8, 10],
    ['chr1', 12, 14]],
columns=['chrom', 'start', 'end']
)
df1.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

df2 = pd.DataFrame(
[['chr1', 4, 8],
 ['chr1', 10, 11]],
columns=['chrom', 'start', 'end']
)
df2.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

counts = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")

counts

  chrom  start  end  count
0  chr1      1    5      1
1  chr1      3    8      1
2  chr1      8   10      0
3  chr1     12   14      0
```
Todo

Support return_input.

Source code in polars_bio/range_op.py
@staticmethod
def count_overlaps(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    suffixes: tuple[str, str] = ("", "_"),
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    on_cols: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
    naive_query: bool = True,
    projection_pushdown: bool = True,
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
    """
    Count pairs of overlapping genomic intervals.
    Bioframe inspired API.

    The coordinate system (0-based or 1-based) is automatically detected from
    DataFrame metadata set at I/O time. Both inputs must have the same coordinate
    system.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported.
        suffixes: Suffixes for the columns of the two overlapped sets.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        on_cols: List of additional column names to join on. Default is None.
        output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        naive_query: If True, use a naive query (based on the overlap operation) for counting overlaps.
        projection_pushdown: Enable column projection pushdown so that only the necessary columns are read at the DataFusion level.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame with the overlap counts for each interval in df1.

    Raises:
        MissingCoordinateSystemError: If either input lacks coordinate system metadata
            and `datafusion.bio.coordinate_system_check` is "true" (default).
        CoordinateSystemMismatchError: If inputs have different coordinate systems.

    Example:
        ```python
        import polars_bio as pb
        import pandas as pd

        df1 = pd.DataFrame([
            ['chr1', 1, 5],
            ['chr1', 3, 8],
            ['chr1', 8, 10],
            ['chr1', 12, 14]],
        columns=['chrom', 'start', 'end']
        )
        df1.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

        df2 = pd.DataFrame(
        [['chr1', 4, 8],
         ['chr1', 10, 11]],
        columns=['chrom', 'start', 'end' ]
        )
        df2.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

        counts = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")

        counts

        chrom  start  end  count
        0  chr1      1    5      1
        1  chr1      3    8      1
        2  chr1      8   10      0
        3  chr1     12   14      0
        ```

    Todo:
         Support return_input.
    """
    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

    # Get filter_op and zero_based from DataFrame metadata
    zero_based = validate_coordinate_systems(df1, df2, ctx)
    filter_op = FilterOp.Strict if zero_based else FilterOp.Weak

    my_ctx = get_py_ctx()
    on_cols = [] if on_cols is None else on_cols
    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
    if naive_query:
        range_options = RangeOptions(
            range_op=RangeOp.CountOverlapsNaive,
            filter_op=filter_op,
            suffixes=suffixes,
            columns_1=cols1,
            columns_2=cols2,
        )
        return range_operation(df2, df1, range_options, output_type, ctx)
    df1 = read_df_to_datafusion(my_ctx, df1)
    df2 = read_df_to_datafusion(my_ctx, df2)

    curr_cols = set(df1.schema().names) | set(df2.schema().names)
    s1start_s2end = prevent_column_collision("s1starts2end", curr_cols)
    s1end_s2start = prevent_column_collision("s1ends2start", curr_cols)
    contig = prevent_column_collision("contig", curr_cols)
    count = prevent_column_collision("count", curr_cols)
    starts = prevent_column_collision("starts", curr_cols)
    ends = prevent_column_collision("ends", curr_cols)
    is_s1 = prevent_column_collision("is_s1", curr_cols)
    suff, _ = suffixes
    df1, df2 = df2, df1
    df1 = df1.select(
        *(
            [
                literal(1).alias(is_s1),
                col(cols1[1]).alias(s1start_s2end),
                col(cols1[2]).alias(s1end_s2start),
                col(cols1[0]).alias(contig),
            ]
            + on_cols
        )
    )
    df2 = df2.select(
        *(
            [
                literal(0).alias(is_s1),
                col(cols2[2]).alias(s1end_s2start),
                col(cols2[1]).alias(s1start_s2end),
                col(cols2[0]).alias(contig),
            ]
            + on_cols
        )
    )

    df = df1.union(df2)

    partitioning = [col(contig)] + [col(c) for c in on_cols]
    df = df.select(
        *(
            [
                s1start_s2end,
                s1end_s2start,
                contig,
                is_s1,
                datafusion.functions.sum(col(is_s1))
                .over(
                    datafusion.expr.Window(
                        partition_by=partitioning,
                        order_by=[
                            col(s1start_s2end).sort(),
                            col(is_s1).sort(ascending=zero_based),
                        ],
                    )
                )
                .alias(starts),
                datafusion.functions.sum(col(is_s1))
                .over(
                    datafusion.expr.Window(
                        partition_by=partitioning,
                        order_by=[
                            col(s1end_s2start).sort(),
                            col(is_s1).sort(ascending=(not zero_based)),
                        ],
                    )
                )
                .alias(ends),
            ]
            + on_cols
        )
    )
    df = df.filter(col(is_s1) == 0)
    df = df.select(
        *(
            [
                col(contig).alias(cols1[0] + suff),
                col(s1end_s2start).alias(cols1[1] + suff),
                col(s1start_s2end).alias(cols1[2] + suff),
            ]
            + on_cols
            + [(col(starts) - col(ends)).alias(count)]
        )
    )

    return convert_result(df, output_type)

coverage(df1, df2, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], output_type='polars.LazyFrame', read_options=None, projection_pushdown=True) staticmethod

Calculate intervals coverage. Bioframe inspired API.

The coordinate system (0-based or 1-based) is automatically detected from DataFrame metadata set at I/O time. Both inputs must have the same coordinate system.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df1 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported. | required |
| df2 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported. | required |
| cols1 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| cols2 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| suffixes | tuple[str, str] | Suffixes for the columns of the two overlapped sets. | ('_1', '_2') |
| on_cols | Union[list[str], None] | List of additional column names to join on. Default is None. | None |
| output_type | str | Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported. | 'polars.LazyFrame' |
| read_options | Union[ReadOptions, None] | Additional options for reading the input files. | None |
| projection_pushdown | bool | Enable column projection pushdown so that only the necessary columns are read at the DataFusion level. | True |

Returns:

| Type | Description |
| --- | --- |
| Union[LazyFrame, DataFrame, 'pd.DataFrame', DataFrame] | polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the intervals with the computed coverage. |

Raises:

| Type | Description |
| --- | --- |
| MissingCoordinateSystemError | If either input lacks coordinate system metadata and datafusion.bio.coordinate_system_check is "true" (default). |
| CoordinateSystemMismatchError | If inputs have different coordinate systems. |

Note

The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:
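A minimal sketch (not from the library's own examples), assuming the same pandas input pattern used by count_overlaps and overlap:

```python
import pandas as pd
import polars_bio as pb

# Query intervals (df1) and reference intervals (df2), 1-based coordinates
df1 = pd.DataFrame(
    [['chr1', 1, 5], ['chr1', 8, 10]],
    columns=['chrom', 'start', 'end'],
)
df1.attrs["coordinate_system_zero_based"] = False

df2 = pd.DataFrame(
    [['chr1', 4, 8]],
    columns=['chrom', 'start', 'end'],
)
df2.attrs["coordinate_system_zero_based"] = False

# Coverage of df1 intervals by df2 intervals, materialized as pandas
cov = pb.coverage(df1, df2, output_type="pandas.DataFrame")
print(cov)
```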

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
@staticmethod
def coverage(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    output_type: str = "polars.LazyFrame",
    read_options: Union[ReadOptions, None] = None,
    projection_pushdown: bool = True,
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
    """
    Calculate intervals coverage.
    Bioframe inspired API.

    The coordinate system (0-based or 1-based) is automatically detected from
    DataFrame metadata set at I/O time. Both inputs must have the same coordinate
    system.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. Default is None.
        output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        read_options: Additional options for reading the input files.
        projection_pushdown: Enable column projection pushdown so that only the necessary columns are read at the DataFusion level.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the intervals with the computed coverage.

    Raises:
        MissingCoordinateSystemError: If either input lacks coordinate system metadata
            and `datafusion.bio.coordinate_system_check` is "true" (default).
        CoordinateSystemMismatchError: If inputs have different coordinate systems.

    Note:
        The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.

    Example:

    Todo:
        Support for on_cols.
    """

    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

    # Get filter_op from DataFrame metadata
    filter_op = _get_filter_op_from_metadata(df1, df2)

    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
    range_options = RangeOptions(
        range_op=RangeOp.Coverage,
        filter_op=filter_op,
        suffixes=suffixes,
        columns_1=cols1,
        columns_2=cols2,
    )
    return range_operation(
        df2,
        df1,
        range_options,
        output_type,
        ctx,
        read_options,
        projection_pushdown=projection_pushdown,
    )

merge(df, min_dist=0, cols=['chrom', 'start', 'end'], on_cols=None, output_type='polars.LazyFrame', projection_pushdown=True) staticmethod

Merge overlapping intervals. It is assumed that start < end.

The coordinate system (0-based or 1-based) is automatically detected from DataFrame metadata set at I/O time.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported. | required |
| min_dist | float | Minimum distance between intervals to merge. Default is 0. | 0 |
| cols | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals. | ['chrom', 'start', 'end'] |
| on_cols | Union[list[str], None] | List of additional column names for clustering. Default is None. | None |
| output_type | str | Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported. | 'polars.LazyFrame' |
| projection_pushdown | bool | Enable column projection pushdown so that only the necessary columns are read at the DataFusion level. | True |

Returns:

| Type | Description |
| --- | --- |
| Union[LazyFrame, DataFrame, 'pd.DataFrame', DataFrame] | polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the merged intervals. |

Raises:

| Type | Description |
| --- | --- |
| MissingCoordinateSystemError | If input lacks coordinate system metadata and datafusion.bio.coordinate_system_check is "true" (default). |

Example:
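A minimal sketch (not from the library's own examples), assuming the documented pandas input pattern:

```python
import pandas as pd
import polars_bio as pb

df = pd.DataFrame(
    [['chr1', 1, 5], ['chr1', 3, 8], ['chr1', 10, 12]],
    columns=['chrom', 'start', 'end'],
)
df.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

# Merge overlapping intervals per chromosome; min_dist controls how close
# non-overlapping intervals may be and still get merged
merged = pb.merge(df, min_dist=0, output_type="pandas.DataFrame")
print(merged)
```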

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
@staticmethod
def merge(
    df: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    min_dist: float = 0,
    cols: Union[list[str], None] = ["chrom", "start", "end"],
    on_cols: Union[list[str], None] = None,
    output_type: str = "polars.LazyFrame",
    projection_pushdown: bool = True,
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
    """
    Merge overlapping intervals. It is assumed that start < end.

    The coordinate system (0-based or 1-based) is automatically detected from
    DataFrame metadata set at I/O time.

    Parameters:
        df: Can be a path to a file, a polars DataFrame, or a pandas DataFrame. CSV with a header, BED and Parquet are supported.
        min_dist: Minimum distance between intervals to merge. Default is 0.
        cols: The names of columns containing the chromosome, start and end of the
            genomic intervals.
        on_cols: List of additional column names for clustering. Default is None.
        output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        projection_pushdown: Enable column projection pushdown so that only the necessary columns are read at the DataFusion level.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the merged intervals.

    Raises:
        MissingCoordinateSystemError: If input lacks coordinate system metadata
            and `datafusion.bio.coordinate_system_check` is "true" (default).

    Example:

    Todo:
        Support for on_cols.
    """
    suffixes = ("_1", "_2")
    _validate_overlap_input(cols, cols, on_cols, suffixes, output_type)

    # Get zero_based from DataFrame metadata
    zero_based = validate_coordinate_system_single(df, ctx)

    my_ctx = get_py_ctx()
    cols = DEFAULT_INTERVAL_COLUMNS if cols is None else cols
    contig = cols[0]
    start = cols[1]
    end = cols[2]

    on_cols = [] if on_cols is None else on_cols
    on_cols = [contig] + on_cols

    df = read_df_to_datafusion(my_ctx, df)
    df_schema = df.schema()
    start_type = df_schema.field(start).type
    end_type = df_schema.field(end).type

    curr_cols = set(df_schema.names)
    start_end = prevent_column_collision("start_end", curr_cols)
    is_start_end = prevent_column_collision("is_start_or_end", curr_cols)
    current_intervals = prevent_column_collision("current_intervals", curr_cols)
    n_intervals = prevent_column_collision("n_intervals", curr_cols)

    end_positions = df.select(
        *(
            [
                (col(end) + min_dist).alias(start_end),
                literal(-1).alias(is_start_end),
            ]
            + on_cols
        )
    )
    start_positions = df.select(
        *([col(start).alias(start_end), literal(1).alias(is_start_end)] + on_cols)
    )
    all_positions = start_positions.union(end_positions)
    start_end_type = all_positions.schema().field(start_end).type
    all_positions = all_positions.select(
        *([col(start_end).cast(start_end_type), col(is_start_end)] + on_cols)
    )

    sorting = [
        col(start_end).sort(),
        col(is_start_end).sort(ascending=zero_based),
    ]
    all_positions = all_positions.sort(*sorting)

    on_cols_expr = [col(c) for c in on_cols]

    win = datafusion.expr.Window(
        partition_by=on_cols_expr,
        order_by=sorting,
    )
    all_positions = all_positions.select(
        *(
            [
                start_end,
                is_start_end,
                datafusion.functions.sum(col(is_start_end))
                .over(win)
                .alias(current_intervals),
            ]
            + on_cols
            + [
                datafusion.functions.row_number(
                    partition_by=on_cols_expr, order_by=sorting
                ).alias(n_intervals)
            ]
        )
    )
    all_positions = all_positions.filter(
        ((col(current_intervals) == 0) & (col(is_start_end) == -1))
        | ((col(current_intervals) == 1) & (col(is_start_end) == 1))
    )
    all_positions = all_positions.select(
        *(
            [start_end, is_start_end]
            + on_cols
            + [
                (
                    (
                        col(n_intervals)
                        - datafusion.functions.lag(
                            col(n_intervals), partition_by=on_cols_expr
                        )
                        + 1
                    )
                    / 2
                )
                .cast(pa.int64())
                .alias(n_intervals)
            ]
        )
    )
    result = all_positions.select(
        *(
            [
                (col(start_end) - min_dist).alias(end),
                is_start_end,
                datafusion.functions.lag(
                    col(start_end), partition_by=on_cols_expr
                ).alias(start),
            ]
            + on_cols
            + [n_intervals]
        )
    )
    result = result.filter(col(is_start_end) == -1)
    result = result.select(
        *(
            [contig, col(start).cast(start_type), col(end).cast(end_type)]
            + on_cols[1:]
            + [n_intervals]
        )
    )

    output = convert_result(result, output_type)

    # Propagate coordinate system metadata to result
    if output_type in ("polars.DataFrame", "polars.LazyFrame", "pandas.DataFrame"):
        set_coordinate_system(output, zero_based)

    return output

nearest(df1, df2, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], output_type='polars.LazyFrame', read_options=None, projection_pushdown=True) staticmethod

Find pairs of closest genomic intervals. Bioframe inspired API.

The coordinate system (0-based or 1-based) is automatically detected from DataFrame metadata set at I/O time. Both inputs must have the same coordinate system.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df1 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported. | required |
| df2 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported. | required |
| cols1 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| cols2 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| suffixes | tuple[str, str] | Suffixes for the columns of the two overlapped sets. | ('_1', '_2') |
| on_cols | Union[list[str], None] | List of additional column names to join on. Default is None. | None |
| output_type | str | Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported. | 'polars.LazyFrame' |
| read_options | Union[ReadOptions, None] | Additional options for reading the input files. | None |
| projection_pushdown | bool | Enable column projection pushdown so that only the necessary columns are read at the DataFusion level. | True |

Returns:

| Type | Description |
| --- | --- |
| Union[LazyFrame, DataFrame, 'pd.DataFrame', DataFrame] | polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the closest interval pairs. |

Raises:

| Type | Description |
| --- | --- |
| MissingCoordinateSystemError | If either input lacks coordinate system metadata and datafusion.bio.coordinate_system_check is "true" (default). |
| CoordinateSystemMismatchError | If inputs have different coordinate systems. |

Note

The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.

Example:
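A minimal sketch (not from the library's own examples), assuming the same pandas input pattern used by the overlap example below:

```python
import pandas as pd
import polars_bio as pb

df1 = pd.DataFrame(
    [['chr1', 1, 5], ['chr1', 12, 14]],
    columns=['chrom', 'start', 'end'],
)
df1.attrs["coordinate_system_zero_based"] = False

df2 = pd.DataFrame(
    [['chr1', 8, 10]],
    columns=['chrom', 'start', 'end'],
)
df2.attrs["coordinate_system_zero_based"] = False

# For each interval in df1, find the closest interval in df2
nearest_pairs = pb.nearest(df1, df2, output_type="pandas.DataFrame")
print(nearest_pairs)
```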

Todo

Support for on_cols.

Source code in polars_bio/range_op.py
@staticmethod
def nearest(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    output_type: str = "polars.LazyFrame",
    read_options: Union[ReadOptions, None] = None,
    projection_pushdown: bool = True,
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
    """
    Find pairs of closest genomic intervals.
    Bioframe inspired API.

    The coordinate system (0-based or 1-based) is automatically detected from
    DataFrame metadata set at I/O time. Both inputs must have the same coordinate
    system.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. Default is None.
        output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        read_options: Additional options for reading the input files.
        projection_pushdown: Enable column projection pushdown so that only the necessary columns are read at the DataFusion level.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the closest interval pairs.

    Raises:
        MissingCoordinateSystemError: If either input lacks coordinate system metadata
            and `datafusion.bio.coordinate_system_check` is "true" (default).
        CoordinateSystemMismatchError: If inputs have different coordinate systems.

    Note:
        The default output format, i.e. [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.

    Example:

    Todo:
        Support for on_cols.
    """

    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

    # Get filter_op from DataFrame metadata
    filter_op = _get_filter_op_from_metadata(df1, df2)

    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
    range_options = RangeOptions(
        range_op=RangeOp.Nearest,
        filter_op=filter_op,
        suffixes=suffixes,
        columns_1=cols1,
        columns_2=cols2,
    )
    return range_operation(
        df1,
        df2,
        range_options,
        output_type,
        ctx,
        read_options,
        projection_pushdown=projection_pushdown,
    )

overlap(df1, df2, suffixes=('_1', '_2'), on_cols=None, cols1=['chrom', 'start', 'end'], cols2=['chrom', 'start', 'end'], algorithm='Coitrees', low_memory=False, output_type='polars.LazyFrame', read_options1=None, read_options2=None, projection_pushdown=True) staticmethod

Find pairs of overlapping genomic intervals. Bioframe inspired API.

The coordinate system (0-based or 1-based) is automatically detected from DataFrame metadata set at I/O time. Both inputs must have the same coordinate system.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df1 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see register_vcf). CSV with a header, BED and Parquet are supported. | required |
| df2 | Union[str, DataFrame, LazyFrame, 'pd.DataFrame'] | Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported. | required |
| cols1 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| cols2 | Union[list[str], None] | The names of columns containing the chromosome, start and end of the genomic intervals, provided separately for each set. | ['chrom', 'start', 'end'] |
| suffixes | tuple[str, str] | Suffixes for the columns of the two overlapped sets. | ('_1', '_2') |
| on_cols | Union[list[str], None] | List of additional column names to join on. Default is None. | None |
| algorithm | str | The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals. | 'Coitrees' |
| low_memory | bool | If True, use a low-memory method for output generation. This may be slower but uses less memory. | False |
| output_type | str | Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported. | 'polars.LazyFrame' |
| read_options1 | Union[ReadOptions, None] | Additional options for reading the input files. | None |
| read_options2 | Union[ReadOptions, None] | Additional options for reading the input files. | None |
| projection_pushdown | bool | Enable column projection pushdown so that only the necessary columns are read at the DataFusion level. | True |

Returns:

| Type | Description |
| --- | --- |
| Union[LazyFrame, DataFrame, 'pd.DataFrame', DataFrame] | polars.LazyFrame or polars.DataFrame or pandas.DataFrame of the overlapping intervals. |

Raises:

| Type | Description |
| --- | --- |
| MissingCoordinateSystemError | If either input lacks coordinate system metadata and datafusion.bio.coordinate_system_check is "true" (default). Use polars-bio I/O functions (scan_*, read_*), which automatically set the metadata, or set it manually on Polars DataFrames via df.config_meta.set(coordinate_system_zero_based=True/False) or on Pandas DataFrames via df.attrs["coordinate_system_zero_based"] = True/False. Set pb.set_option("datafusion.bio.coordinate_system_check", False) to disable strict checking and fall back to the global coordinate system setting (see the sketch below). |
| CoordinateSystemMismatchError | If inputs have different coordinate systems. |
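The remedies listed above can be applied directly before calling a range operation. A minimal sketch of the two manual options, assuming polars-config-meta is available through polars_bio as described in the error message:

```python
import pandas as pd
import polars as pl
import polars_bio as pb

# Pandas: attach coordinate-system metadata via DataFrame attrs
pdf = pd.DataFrame([['chr1', 1, 5]], columns=['chrom', 'start', 'end'])
pdf.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

# Polars: attach the same metadata via polars-config-meta
pldf = pl.DataFrame({"chrom": ["chr1"], "start": [1], "end": [5]})
pldf.config_meta.set(coordinate_system_zero_based=False)

# Alternatively, disable the strict check and fall back to the
# global coordinate system setting
pb.set_option("datafusion.bio.coordinate_system_check", False)
```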

Note
  1. The default output format, i.e. LazyFrame, is recommended for large datasets as it supports output streaming and lazy evaluation. This enables efficient processing of large datasets without loading the entire output dataset into memory.
  2. Streaming is only supported for polars.LazyFrame output.
Example

```python
import polars_bio as pb
import pandas as pd

df1 = pd.DataFrame([
    ['chr1', 1, 5],
    ['chr1', 3, 8],
    ['chr1', 8, 10],
    ['chr1', 12, 14]],
columns=['chrom', 'start', 'end']
)
df1.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

df2 = pd.DataFrame(
[['chr1', 4, 8],
 ['chr1', 10, 11]],
columns=['chrom', 'start', 'end']
)
df2.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

overlapping_intervals
    chrom_1         start_1     end_1 chrom_2       start_2  end_2
0     chr1            1          5     chr1            4          8
1     chr1            3          8     chr1            4          8
```
Todo

Support for on_cols.

Source code in polars_bio/range_op.py
@staticmethod
def overlap(
    df1: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    df2: Union[str, pl.DataFrame, pl.LazyFrame, "pd.DataFrame"],
    suffixes: tuple[str, str] = ("_1", "_2"),
    on_cols: Union[list[str], None] = None,
    cols1: Union[list[str], None] = ["chrom", "start", "end"],
    cols2: Union[list[str], None] = ["chrom", "start", "end"],
    algorithm: str = "Coitrees",
    low_memory: bool = False,
    output_type: str = "polars.LazyFrame",
    read_options1: Union[ReadOptions, None] = None,
    read_options2: Union[ReadOptions, None] = None,
    projection_pushdown: bool = True,
) -> Union[pl.LazyFrame, pl.DataFrame, "pd.DataFrame", datafusion.DataFrame]:
    """
    Find pairs of overlapping genomic intervals.
    Bioframe inspired API.

    The coordinate system (0-based or 1-based) is automatically detected from
    DataFrame metadata set at I/O time. Both inputs must have the same coordinate
    system.

    Parameters:
        df1: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table (see [register_vcf](api.md#polars_bio.register_vcf)). CSV with a header, BED and Parquet are supported.
        df2: Can be a path to a file, a polars DataFrame, a pandas DataFrame, or a registered table. CSV with a header, BED and Parquet are supported.
        cols1: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        cols2: The names of columns containing the chromosome, start and end of the
            genomic intervals, provided separately for each set.
        suffixes: Suffixes for the columns of the two overlapped sets.
        on_cols: List of additional column names to join on. Default is None.
        algorithm: The algorithm to use for the overlap operation. Available options: Coitrees, IntervalTree, ArrayIntervalTree, Lapper, SuperIntervals.
        low_memory: If True, use a low-memory method for output generation. This may be slower but uses less memory.
        output_type: Type of the output. Default is "polars.LazyFrame"; "polars.DataFrame", "pandas.DataFrame" and "datafusion.DataFrame" are also supported.
        read_options1: Additional options for reading the input files.
        read_options2: Additional options for reading the input files.
        projection_pushdown: Enable column projection pushdown so that only the necessary columns are read at the DataFusion level.

    Returns:
        **polars.LazyFrame** or polars.DataFrame or pandas.DataFrame of the overlapping intervals.

    Raises:
        MissingCoordinateSystemError: If either input lacks coordinate system metadata
            and `datafusion.bio.coordinate_system_check` is "true" (default). Use polars-bio
            I/O functions (scan_*, read_*) which automatically set metadata, or set it manually
            on Polars DataFrames via `df.config_meta.set(coordinate_system_zero_based=True/False)`
            or on Pandas DataFrames via `df.attrs["coordinate_system_zero_based"] = True/False`.
            Set `pb.set_option("datafusion.bio.coordinate_system_check", False)` to disable
            strict checking and fall back to global coordinate system setting.
        CoordinateSystemMismatchError: If inputs have different coordinate systems.

    Note:
        1. The default output format, i.e.  [LazyFrame](https://docs.pola.rs/api/python/stable/reference/lazyframe/index.html), is recommended for large datasets as it supports output streaming and lazy evaluation.
        This enables efficient processing of large datasets without loading the entire output dataset into memory.
        2. Streaming is only supported for polars.LazyFrame output.

    Example:
        ```python
        import polars_bio as pb
        import pandas as pd

        df1 = pd.DataFrame([
            ['chr1', 1, 5],
            ['chr1', 3, 8],
            ['chr1', 8, 10],
            ['chr1', 12, 14]],
        columns=['chrom', 'start', 'end']
        )
        df1.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

        df2 = pd.DataFrame(
        [['chr1', 4, 8],
         ['chr1', 10, 11]],
        columns=['chrom', 'start', 'end' ]
        )
        df2.attrs["coordinate_system_zero_based"] = False  # 1-based coordinates

        overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")

        overlapping_intervals
            chrom_1         start_1     end_1 chrom_2       start_2  end_2
        0     chr1            1          5     chr1            4          8
        1     chr1            3          8     chr1            4          8

        ```

    Todo:
         Support for on_cols.
    """

    _validate_overlap_input(cols1, cols2, on_cols, suffixes, output_type)

    # Get filter_op from DataFrame metadata
    filter_op = _get_filter_op_from_metadata(df1, df2)

    cols1 = DEFAULT_INTERVAL_COLUMNS if cols1 is None else cols1
    cols2 = DEFAULT_INTERVAL_COLUMNS if cols2 is None else cols2
    range_options = RangeOptions(
        range_op=RangeOp.Overlap,
        filter_op=filter_op,
        suffixes=suffixes,
        columns_1=cols1,
        columns_2=cols2,
        overlap_alg=algorithm,
        overlap_low_memory=low_memory,
    )

    return range_operation(
        df1,
        df2,
        range_options,
        output_type,
        ctx,
        read_options1,
        read_options2,
        projection_pushdown,
    )

get_metadata(df)

Get all metadata attached to a DataFrame or LazyFrame.

Returns all metadata including:

- Source file information (format, path)
- Format-specific metadata (VCF INFO/FORMAT fields, FASTQ quality encoding, etc.)
- Comprehensive Arrow schema metadata (if available)

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | | Polars DataFrame or LazyFrame (or Pandas DataFrame) | required |

Returns:

dict with the following keys:

- "format": File format identifier (e.g., "vcf", "fastq", "bam")
- "path": Original file path
- "coordinate_system_zero_based": Boolean indicating coordinate system (True=0-based, False=1-based, None=not set)
- "header": Format-specific header data as dict, may include:
    - For VCF: "info_fields", "format_fields", "sample_names", "version", "contigs", "filters", etc.
    - For FASTQ: quality encoding information
    - For other formats: format-specific metadata
    - "_datafusion_table_name": Internal DataFusion table name (for debugging)

Examples:

Get all metadata from a VCF file:

```python
import polars_bio as pb
lf = pb.scan_vcf("file.vcf")
meta = pb.get_metadata(lf)
```

Access basic metadata:

```python
meta["format"]                        # Returns: 'vcf'
meta["path"]                          # Returns: 'file.vcf'
meta["coordinate_system_zero_based"]  # Returns: False (1-based for VCF)
```

Access VCF-specific metadata:

```python
info_fields = meta["header"]["info_fields"]
format_fields = meta["header"]["format_fields"]
sample_names = meta["header"]["sample_names"]
version = meta["header"]["version"]
contigs = meta["header"]["contigs"]
```

Source code in polars_bio/_metadata.py
def get_metadata(df) -> dict:
    """Get all metadata attached to a DataFrame or LazyFrame.

    Returns all metadata including:
    - Source file information (format, path)
    - Format-specific metadata (VCF INFO/FORMAT fields, FASTQ quality encoding, etc.)
    - Comprehensive Arrow schema metadata (if available)

    Args:
        df: Polars DataFrame or LazyFrame (or Pandas DataFrame)

    Returns:
        Dict with keys:
        - "format": File format identifier (e.g., "vcf", "fastq", "bam")
        - "path": Original file path
        - "coordinate_system_zero_based": Boolean indicating coordinate system (True=0-based, False=1-based, None=not set)
        - "header": Format-specific header data as dict, may include:
            - For VCF: "info_fields", "format_fields", "sample_names", "version", "contigs", "filters", etc.
            - For FASTQ: quality encoding information
            - For other formats: format-specific metadata
            - "_datafusion_table_name": Internal DataFusion table name (for debugging)

    Examples:
        Get all metadata from a VCF file:
        ```python
        import polars_bio as pb
        lf = pb.scan_vcf("file.vcf")
        meta = pb.get_metadata(lf)
        ```

        Access basic metadata:
        ```python
        meta["format"]                        # Returns: 'vcf'
        meta["path"]                          # Returns: 'file.vcf'
        meta["coordinate_system_zero_based"]  # Returns: False (1-based for VCF)
        ```

        Access VCF-specific metadata:
        ```python
        info_fields = meta["header"]["info_fields"]
        format_fields = meta["header"]["format_fields"]
        sample_names = meta["header"]["sample_names"]
        version = meta["header"]["version"]
        contigs = meta["header"]["contigs"]
        ```
    """
    result = {
        "format": None,
        "path": None,
        "coordinate_system_zero_based": None,
        "header": None,
    }

    if _has_config_meta(df):
        # Polars DataFrame/LazyFrame
        try:
            metadata = df.config_meta.get_metadata()
        except (KeyError, AttributeError, TypeError):
            return result

        result["format"] = metadata.get(SOURCE_FORMAT_KEY)
        result["path"] = metadata.get(SOURCE_PATH_KEY)
        result["coordinate_system_zero_based"] = metadata.get(COORDINATE_SYSTEM_KEY)

        header_json = metadata.get(SOURCE_HEADER_KEY)
        if header_json:
            try:
                result["header"] = json.loads(header_json)
            except (json.JSONDecodeError, TypeError):
                pass

    elif _is_pandas_dataframe(df):
        # Pandas DataFrame
        if hasattr(df, "attrs"):
            result["format"] = df.attrs.get(SOURCE_FORMAT_KEY)
            result["path"] = df.attrs.get(SOURCE_PATH_KEY)
            result["coordinate_system_zero_based"] = df.attrs.get(COORDINATE_SYSTEM_KEY)

            header_json = df.attrs.get(SOURCE_HEADER_KEY)
            if header_json:
                try:
                    result["header"] = json.loads(header_json)
                except (json.JSONDecodeError, TypeError):
                    pass

    return result

get_option(key)

Get the value of a configuration option.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| key | | The configuration key. | required |

Returns:

The current value of the option as a string, or None if not set.

Example

```python
import polars_bio as pb
pb.get_option("datafusion.bio.coordinate_system_zero_based")
'true'
```
Source code in polars_bio/context.py
def get_option(key):
    """Get the value of a configuration option.

    Args:
        key: The configuration key.

    Returns:
        The current value of the option as a string, or None if not set.

    Example:
        ```python
        import polars_bio as pb
        pb.get_option("datafusion.bio.coordinate_system_zero_based")
        'true'
        ```
    """
    return Context().get_option(key)

print_metadata_json(df, indent=2)

Print metadata as pretty-formatted JSON.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | Union[DataFrame, LazyFrame] | Polars DataFrame or LazyFrame | required |
| indent | int | Number of spaces for indentation (default: 2) | 2 |

Example

```python
import polars_bio as pb
lf = pb.scan_vcf("file.vcf")
pb.print_metadata_json(lf)
```
Source code in polars_bio/_metadata.py
def print_metadata_json(df: Union[pl.DataFrame, pl.LazyFrame], indent: int = 2) -> None:
    """Print metadata as pretty-formatted JSON.

    Args:
        df: Polars DataFrame or LazyFrame
        indent: Number of spaces for indentation (default: 2)

    Example:
        ```python
        import polars_bio as pb
        lf = pb.scan_vcf("file.vcf")
        pb.print_metadata_json(lf)
        ```
    """
    meta = get_metadata(df)
    print(json.dumps(meta, indent=indent, default=str))

print_metadata_summary(df)

Print a human-readable summary of all metadata.

Displays a formatted summary of all metadata attached to a DataFrame or LazyFrame, including format, path, coordinate system, and format-specific information.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| df | Union[DataFrame, LazyFrame] | Polars DataFrame or LazyFrame | required |

Example

```python
import polars_bio as pb
lf = pb.scan_vcf("file.vcf")
pb.print_metadata_summary(lf)
```
Source code in polars_bio/_metadata.py
def print_metadata_summary(df: Union[pl.DataFrame, pl.LazyFrame]) -> None:
    """Print a human-readable summary of all metadata.

    Displays a formatted summary of all metadata attached to a DataFrame or LazyFrame,
    including format, path, coordinate system, and format-specific information.

    Args:
        df: Polars DataFrame or LazyFrame

    Example:
        ```python
        import polars_bio as pb
        lf = pb.scan_vcf("file.vcf")
        pb.print_metadata_summary(lf)
        ```
    """
    meta = get_metadata(df)
    if not meta or not any([meta.get("format"), meta.get("path"), meta.get("header")]):
        print("No metadata available")
        return

    print("=" * 70)
    print("Metadata Summary")
    print("=" * 70)
    print()

    # Basic metadata
    if meta.get("format"):
        print(f"Format: {meta['format']}")
    if meta.get("path"):
        print(f"Path: {meta['path']}")
    if meta.get("coordinate_system_zero_based") is not None:
        coord_sys = "0-based" if meta["coordinate_system_zero_based"] else "1-based"
        print(f"Coordinate System: {coord_sys}")

    # Format-specific metadata
    if meta.get("header"):
        header = meta["header"]
        print()
        print("Format-specific metadata:")
        print("-" * 70)

        # VCF-specific
        if meta.get("format") == "vcf":
            if "version" in header:
                print(f"  VCF Version: {header['version']}")
            if "sample_names" in header:
                samples = header["sample_names"]
                print(f"  Samples ({len(samples)}): {', '.join(samples[:5])}")
                if len(samples) > 5:
                    print(f"    ... and {len(samples) - 5} more")
            if "info_fields" in header:
                print(f"  INFO fields: {len(header['info_fields'])}")
                for field_id in list(header["info_fields"].keys())[:3]:
                    field = header["info_fields"][field_id]
                    print(
                        f"    - {field_id}: {field.get('type')} ({field.get('description', 'No description')})"
                    )
                if len(header["info_fields"]) > 3:
                    print(f"    ... and {len(header['info_fields']) - 3} more")
            if "format_fields" in header:
                print(f"  FORMAT fields: {len(header['format_fields'])}")
                for field_id in list(header["format_fields"].keys())[:3]:
                    field = header["format_fields"][field_id]
                    print(
                        f"    - {field_id}: {field.get('type')} ({field.get('description', 'No description')})"
                    )
                if len(header["format_fields"]) > 3:
                    print(f"    ... and {len(header['format_fields']) - 3} more")
            if "contigs" in header and header["contigs"]:
                print(f"  Contigs: {len(header['contigs'])}")
            if "filters" in header and header["filters"]:
                print(f"  Filters: {len(header['filters'])}")

        # Other formats can be added here as needed

    print()
    print("=" * 70)

set_loglevel(level)

Set the log level for the logger and root logger.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `level` | `str` | The log level to set. Can be "debug", "info", "warn", or "warning". | required |

Note

The log level should be set as the first step after importing the library. Once set, it can only be decreased, not increased; to increase the log level, you need to restart the Python session.

Example

import polars_bio as pb
pb.set_loglevel("info")

Source code in polars_bio/logging.py
def set_loglevel(level: str):
    """Set the log level for the logger and root logger.

    Args:
        level: The log level to set. Can be "debug", "info", "warn", or "warning".

    !!! note
        The log level should be set as a **first** step after importing the library.
        Once set it can be only **decreased**, not increased. In order to increase
        the log level, you need to restart the Python session.

        Example:
        ```python
        import polars_bio as pb
        pb.set_loglevel("info")
        ```
    """
    level = level.lower()
    if level == "debug":
        logger.setLevel(logging.DEBUG)
        root_logger.setLevel(logging.DEBUG)
        logging.basicConfig(level=logging.DEBUG)
    elif level == "info":
        logger.setLevel(logging.INFO)
        root_logger.setLevel(logging.INFO)
        logging.basicConfig(level=logging.INFO)
    elif level == "warn" or level == "warning":
        logger.setLevel(logging.WARN)
        root_logger.setLevel(logging.WARN)
        logging.basicConfig(level=logging.WARN)
    else:
        raise ValueError(f"{level} is not a valid log level")
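
A minimal sketch of the ordering described in the note above: configure logging as the first step after importing the library.

```python
import polars_bio as pb

# Set the log level immediately after import (see the note above).
pb.set_loglevel("info")

# Per the note, the level can later only be decreased (e.g. to "debug");
# raising it again (e.g. back to "warn") requires a new Python session.
```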

set_option(key, value)

Set a configuration option.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `key` | | The configuration key. | required |
| `value` | | The value to set (bool values are converted to "true"/"false"). | required |

Example
import polars_bio as pb
pb.set_option("datafusion.bio.coordinate_system_zero_based", False)
Source code in polars_bio/context.py
def set_option(key, value):
    """Set a configuration option.

    Args:
        key: The configuration key.
        value: The value to set (bool values are converted to "true"/"false").

    Example:
        ```python
        import polars_bio as pb
        pb.set_option("datafusion.bio.coordinate_system_zero_based", False)
        ```
    """
    Context().set_option(key, value)
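
A short round-trip sketch combining `set_option` with `get_option` (documented above): booleans are stored as the strings "true"/"false", so reading the option back returns the string form rather than a Python bool.

```python
import polars_bio as pb

# The boolean is converted to the string "false" on write...
pb.set_option("datafusion.bio.coordinate_system_zero_based", False)

# ...so reading it back yields the string form, not a Python bool.
pb.get_option("datafusion.bio.coordinate_system_zero_based")
# expected: 'false'
```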

set_source_metadata(df, format, path='', header=None)

Set standardized source file metadata.

Stores metadata about the source file format, path, and format-specific header information. This standardized approach works across all file formats (VCF, FASTQ, BAM, GFF, BED, FASTA, CRAM).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `df` | | Polars DataFrame or LazyFrame (or Pandas DataFrame) | required |
| `format` | `str` | File format identifier (e.g., "vcf", "fastq", "bam") | required |
| `path` | `str` | Original file path (default: "") | `''` |
| `header` | `dict` | Format-specific header data as dict (default: None). For VCF: `{"info_fields": {...}, "format_fields": {...}, "sample_names": [...], ...}`; for other formats: format-specific metadata | `None` |

Example
import polars_bio as pb
lf = pb.scan_vcf("sample.vcf")
header = {"info_fields": {...}, "sample_names": ["sample1"]}
pb.set_source_metadata(lf, format="vcf", path="sample.vcf", header=header)
Source code in polars_bio/_metadata.py
def set_source_metadata(df, format: str, path: str = "", header: dict = None):
    """Set standardized source file metadata.

    Stores metadata about the source file format, path, and format-specific
    header information. This standardized approach works across all file
    formats (VCF, FASTQ, BAM, GFF, BED, FASTA, CRAM).

    Args:
        df: Polars DataFrame or LazyFrame (or Pandas DataFrame)
        format: File format identifier (e.g., "vcf", "fastq", "bam")
        path: Original file path (default: "")
        header: Format-specific header data as dict (default: None)
                For VCF: {"info_fields": {...}, "format_fields": {...}, "sample_names": [...], ...}
                For other formats: format-specific metadata

    Example:
        ```python
        import polars_bio as pb
        lf = pb.scan_vcf("sample.vcf")
        header = {"info_fields": {...}, "sample_names": ["sample1"]}
        pb.set_source_metadata(lf, format="vcf", path="sample.vcf", header=header)
        ```
    """
    if _has_config_meta(df):
        # Polars DataFrame/LazyFrame
        metadata_updates = {
            SOURCE_FORMAT_KEY: format,
            SOURCE_PATH_KEY: path,
            SOURCE_HEADER_KEY: json.dumps(header) if header else "",
        }
        df.config_meta.set(**metadata_updates)
    elif _is_pandas_dataframe(df):
        # Pandas DataFrame
        if not hasattr(df, "attrs"):
            df.attrs = {}
        df.attrs[SOURCE_FORMAT_KEY] = format
        df.attrs[SOURCE_PATH_KEY] = path
        df.attrs[SOURCE_HEADER_KEY] = json.dumps(header) if header else ""