📚Tutorial
Introduction to polars-bio features¶
Interval operations illustrated¶
In [5]:
Copied!
import polars_bio as pb
import pandas as pd
import polars as pl
import bioframe as bf
import polars_bio as pb
import pandas as pd
import polars as pl
import bioframe as bf
In [12]:
Copied!
cols = ["contig", "pos_start", "pos_end"]
cols = ["contig", "pos_start", "pos_end"]
In [2]:
Copied!
df1 = pd.DataFrame(
[["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
columns=["chrom", "start", "end"],
)
df2 = pd.DataFrame(
[["chr1", 4, 8], ["chr1", 10, 11]], columns=["chrom", "start", "end"]
)
df1 = pd.DataFrame(
[["chr1", 1, 5], ["chr1", 3, 8], ["chr1", 8, 10], ["chr1", 12, 14]],
columns=["chrom", "start", "end"],
)
df2 = pd.DataFrame(
[["chr1", 4, 8], ["chr1", 10, 11]], columns=["chrom", "start", "end"]
)
In [17]:
Copied!
display(df1)
display(df1)
chrom | start | end | |
---|---|---|---|
0 | chr1 | 1 | 5 |
1 | chr1 | 3 | 8 |
2 | chr1 | 8 | 10 |
3 | chr1 | 12 | 14 |
In [18]:
Copied!
display(df2)
display(df2)
chrom | start | end | |
---|---|---|---|
0 | chr1 | 4 | 8 |
1 | chr1 | 10 | 11 |
Overlap¶
In [3]:
Copied!
overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")
overlapping_intervals = pb.overlap(df1, df2, output_type="pandas.DataFrame")
In [4]:
Copied!
display(overlapping_intervals)
display(overlapping_intervals)
chrom_1 | start_1 | end_1 | chrom_2 | start_2 | end_2 | |
---|---|---|---|---|---|---|
0 | chr1 | 1 | 5 | chr1 | 4 | 8 |
1 | chr1 | 3 | 8 | chr1 | 4 | 8 |
2 | chr1 | 8 | 10 | chr1 | 4 | 8 |
3 | chr1 | 8 | 10 | chr1 | 10 | 11 |
In [5]:
Copied!
pb.visualize_intervals(overlapping_intervals)
pb.visualize_intervals(overlapping_intervals)
Nearest¶
In [6]:
Copied!
nearest_intervals = pb.nearest(df1, df2, output_type="pandas.DataFrame")
nearest_intervals = pb.nearest(df1, df2, output_type="pandas.DataFrame")
In [7]:
Copied!
display(nearest_intervals)
display(nearest_intervals)
chrom_1 | start_1 | end_1 | chrom_2 | start_2 | end_2 | distance | |
---|---|---|---|---|---|---|---|
0 | chr1 | 1 | 5 | chr1 | 4 | 8 | 0 |
1 | chr1 | 3 | 8 | chr1 | 4 | 8 | 0 |
2 | chr1 | 8 | 10 | chr1 | 4 | 8 | 0 |
3 | chr1 | 12 | 14 | chr1 | 10 | 11 | 1 |
In [9]:
Copied!
pb.visualize_intervals(nearest_intervals, "nearest pair")
pb.visualize_intervals(nearest_intervals, "nearest pair")
Count overlaps¶
In [3]:
Copied!
count_overlaps = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")
count_overlaps = pb.count_overlaps(df1, df2, output_type="pandas.DataFrame")
In [4]:
Copied!
display(count_overlaps)
display(count_overlaps)
chrom | start | end | count | |
---|---|---|---|---|
0 | chr1 | 1 | 5 | 1 |
1 | chr1 | 3 | 8 | 1 |
2 | chr1 | 8 | 10 | 2 |
3 | chr1 | 12 | 14 | 0 |
Coverage¶
In [12]:
Copied!
coverage = pb.coverage(df1, df2, output_type="pandas.DataFrame")
coverage = pb.coverage(df1, df2, output_type="pandas.DataFrame")
In [13]:
Copied!
display(coverage)
display(coverage)
chrom | start | end | coverage | |
---|---|---|---|---|
0 | chr1 | 1 | 5 | 2 |
1 | chr1 | 3 | 8 | 4 |
2 | chr1 | 8 | 10 | 2 |
3 | chr1 | 12 | 14 | 0 |
Merge¶
In [19]:
Copied!
merge = pb.merge(df1,output_type="pandas.DataFrame")
merge = pb.merge(df1,output_type="pandas.DataFrame")
In [20]:
Copied!
display(merge)
display(merge)
chrom | start | end | n_intervals | |
---|---|---|---|---|
0 | chr1 | 1 | 10 | 3 |
1 | chr1 | 12 | 14 | 1 |
Reading bioinformatics data¶
GFF¶
Basic reading with attributes unnesting¶
In [46]:
Copied!
gff = pb.read_gff("data/example.gff3.bgz")
gff = pb.read_gff("data/example.gff3.bgz")
In [26]:
Copied!
gff.limit(3).collect()
gff.limit(3).collect()
Out[26]:
shape: (3, 9)
chrom | start | end | type | source | score | strand | phase | attributes |
---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | f32 | str | u32 | list[struct[2]] |
"chr1" | 11869 | 14409 | "gene" | "HAVANA" | null | "+" | null | [{"ID","ENSG00000223972.5"}, {"gene_id","ENSG00000223972.5"}, … {"havana_gene","OTTHUMG00000000961.2"}] |
"chr1" | 11869 | 14409 | "transcript" | "HAVANA" | null | "+" | null | [{"ID","ENST00000456328.2"}, {"Parent","ENSG00000223972.5"}, … {"havana_transcript","OTTHUMT00000362751.1"}] |
"chr1" | 11869 | 12227 | "exon" | "HAVANA" | null | "+" | null | [{"ID","exon:ENST00000456328.2:1"}, {"Parent","ENST00000456328.2"}, … {"havana_transcript","OTTHUMT00000362751.1"}] |
Unnesting attributes¶
In [44]:
Copied!
gff = pb.read_gff("data/example.gff3.bgz", attr_fields=["ID", "havana_transcript"])
gff = pb.read_gff("data/example.gff3.bgz", attr_fields=["ID", "havana_transcript"])
In [45]:
Copied!
gff.limit(3).collect()
gff.limit(3).collect()
Out[45]:
shape: (3, 10)
chrom | start | end | type | source | score | strand | phase | ID | havana_transcript |
---|---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | f32 | str | u32 | str | str |
"chr1" | 11869 | 14409 | "gene" | "HAVANA" | null | "+" | null | "ENSG00000223972.5" | null |
"chr1" | 11869 | 14409 | "transcript" | "HAVANA" | null | "+" | null | "ENST00000456328.2" | "OTTHUMT00000362751.1" |
"chr1" | 11869 | 12227 | "exon" | "HAVANA" | null | "+" | null | "exon:ENST00000456328.2:1" | "OTTHUMT00000362751.1" |
VCF¶
In [43]:
Copied!
pb.read_vcf("data/example.vcf").limit(3).collect()
pb.read_vcf("data/example.vcf").limit(3).collect()
Out[43]:
shape: (2, 8)
chrom | start | end | id | ref | alt | qual | filter |
---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str |
"chr21" | 26960070 | 26960070 | "rs116645811" | "G" | "A" | 0.0 | "" |
"chr21" | 26965148 | 26965148 | "rs1135638" | "G" | "A" | 0.0 | "" |
Get info fields available in the VCF file¶
In [42]:
Copied!
pb.describe_vcf("data/example.vcf").sort("name")
pb.describe_vcf("data/example.vcf").sort("name")
Out[42]:
shape: (1, 3)
name | type | description |
---|---|---|
str | str | str |
"CSQ" | "String" | "Consequence annotations from E… |
In [41]:
Copied!
pb.read_vcf("data/example.vcf", info_fields=["CSQ"]).limit(3).collect()
pb.read_vcf("data/example.vcf", info_fields=["CSQ"]).limit(3).collect()
Out[41]:
shape: (2, 9)
chrom | start | end | id | ref | alt | qual | filter | csq |
---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str | list[str] |
"chr21" | 26960070 | 26960070 | "rs116645811" | "G" | "A" | 0.0 | "" | ["A|missense_variant|MODERATE|MRPL39|ENSG00000154719|Transcript|ENST00000307301|protein_coding|10/11||||1043|1001|334|T/M|aCg/aTg|||-1||HGNC|14027", "A|intron_variant|MODIFIER|MRPL39|ENSG00000154719|Transcript|ENST00000352957|protein_coding||9/9||||||||||-1||HGNC|14027", "A|upstream_gene_variant|MODIFIER|LINC00515|ENSG00000260583|Transcript|ENST00000567517|antisense|||||||||||4432|-1||HGNC|16019"] |
"chr21" | 26965148 | 26965148 | "rs1135638" | "G" | "A" | 0.0 | "" | ["A|synonymous_variant|LOW|MRPL39|ENSG00000154719|Transcript|ENST00000307301|protein_coding|8/11||||939|897|299|G|ggC/ggT|||-1||HGNC|14027", "A|synonymous_variant|LOW|MRPL39|ENSG00000154719|Transcript|ENST00000352957|protein_coding|8/10||||939|897|299|G|ggC/ggT|||-1||HGNC|14027", "A|synonymous_variant|LOW|MRPL39|ENSG00000154719|Transcript|ENST00000419219|protein_coding|8/8||||876|867|289|G|ggC/ggT|||-1|cds_end_NF|HGNC|14027"] |
FASTQ¶
In [39]:
Copied!
pb.read_fastq("data/example.fastq.gz").limit(3).collect()
pb.read_fastq("data/example.fastq.gz").limit(3).collect()
Out[39]:
shape: (3, 4)
name | description | sequence | quality_scores |
---|---|---|---|
str | str | str | str |
"SRR9130495.1" | "D00236:723:HG32CBCX2:1:1108:13… | "NCAATACAAAAGCAATATGGGAGAAGCTAC… | "#4BDFDFFHGHGGJJJHIIIIGGIIJGJJG… |
"SRR9130495.2" | "D00236:723:HG32CBCX2:1:1108:14… | "NGTCAAAGATAAGATCAAAAGGCACTGGCT… | "#1=DDDDD>DHFH@EFHHGHGGFGIIIGIG… |
"SRR9130495.3" | "D00236:723:HG32CBCX2:1:1108:17… | "GTTTTCCTCTGGTTATTTCTAGGTACACTG… | "@@@DDDFFHHHFHBHIIGJIJIIJIIIEHG… |
BAM¶
In [40]:
Copied!
pb.read_bam("data/example.bam").limit(3).collect()
pb.read_bam("data/example.bam").limit(3).collect()
Out[40]:
shape: (3, 11)
name | chrom | start | end | flags | cigar | mapping_quality | mate_chrom | mate_start | sequence | quality_scores |
---|---|---|---|---|---|---|---|---|---|---|
str | str | u32 | u32 | u32 | str | u32 | str | u32 | str | str |
"20FUKAAXX100202:1:21:2075:1360… | "chr1" | 1 | 101 | 1187 | "101M" | 0 | "chr1" | 178 | "TAACCCTAACCCTAACCCTAACCCTAACCC… | "?>=>?@CBC@BBCCDABACCDABBB9:788… |
"20FUKAAXX100202:1:21:2733:1836… | "chr1" | 1 | 101 | 1123 | "101M" | 0 | "chr1" | 183 | "TAACCCTAACCCTAACCCTAACCCTAACCC… | "CCDACCDCDABBDCDABBDCDABBDCDABB… |
"20FUKAAXX100202:1:22:19822:802… | "chr1" | 1 | 101 | 163 | "101M" | 4 | "chr1" | 35 | "TAACCCTAACCCTAACCCTAACCCTAACCC… | "@CC?@@CBC@BBCCDABBCCDABBCCDABB… |
BED¶
In [47]:
Copied!
pb.read_bed("data/example.bed.bgz").limit(3).collect()
pb.read_bed("data/example.bed.bgz").limit(3).collect()
Out[47]:
shape: (3, 4)
chrom | start | end | name |
---|---|---|---|
str | u32 | u32 | str |
"chr16" | 14800001 | 16800000 | "FRA16A" |
"chr16" | 66700001 | 70800000 | "FRA16B" |
"chr16" | 63917503 | 63934965 | "FRA16C" |
SQL processing¶
Registering tables¶
In [5]:
Copied!
pb.register_gff("data/example.gff3.bgz", "example_gff")
pb.register_gff("data/example.gff3.bgz", "example_gff")
In [9]:
Copied!
pb.sql("SHOW TABLES").collect()
pb.sql("SHOW TABLES").collect()
0rows [00:00, ?rows/s]
Out[9]:
shape: (9, 4)
table_catalog | table_schema | table_name | table_type |
---|---|---|---|
str | str | str | str |
"datafusion" | "public" | "s1" | "BASE TABLE" |
"datafusion" | "public" | "count_overlaps_coverage" | "LOCAL TEMPORARY" |
"datafusion" | "public" | "s2" | "BASE TABLE" |
"datafusion" | "public" | "example_gff" | "BASE TABLE" |
"datafusion" | "information_schema" | "tables" | "VIEW" |
"datafusion" | "information_schema" | "views" | "VIEW" |
"datafusion" | "information_schema" | "columns" | "VIEW" |
"datafusion" | "information_schema" | "df_settings" | "VIEW" |
"datafusion" | "information_schema" | "schemata" | "VIEW" |
Querying registered tables¶
In [10]:
Copied!
pb.sql("DESCRIBE example_gff").collect()
pb.sql("DESCRIBE example_gff").collect()
0rows [00:00, ?rows/s]
Out[10]:
shape: (9, 3)
column_name | data_type | is_nullable |
---|---|---|
str | str | str |
"chrom" | "Utf8" | "NO" |
"start" | "UInt32" | "NO" |
"end" | "UInt32" | "NO" |
"type" | "Utf8" | "NO" |
"source" | "Utf8" | "NO" |
"score" | "Float32" | "YES" |
"strand" | "Utf8" | "NO" |
"phase" | "UInt32" | "YES" |
"attributes" | "List(Field { name: "item", dat… | "YES" |
In [12]:
Copied!
pb.sql("SELECT start, end, type FROM example_gff WHERE end = 14409").collect()
pb.sql("SELECT start, end, type FROM example_gff WHERE end = 14409").collect()
0rows [00:00, ?rows/s]
Out[12]:
shape: (2, 3)
start | end | type |
---|---|---|
u32 | u32 | str |
11869 | 14409 | "gene" |
11869 | 14409 | "transcript" |
Registering views¶
In [15]:
Copied!
pb.register_view("v_example_gff", "SELECT start, end, type FROM example_gff WHERE end = 14409")
pb.register_view("v_example_gff", "SELECT start, end, type FROM example_gff WHERE end = 14409")
In [16]:
Copied!
pb.sql("SELECT * FROM v_example_gff").collect()
pb.sql("SELECT * FROM v_example_gff").collect()
0rows [00:00, ?rows/s]
Out[16]:
shape: (2, 3)
start | end | type |
---|---|---|
u32 | u32 | str |
11869 | 14409 | "gene" |
11869 | 14409 | "transcript" |
Working with cloud storage¶
In [20]:
Copied!
pb.describe_vcf("gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz", compression_type="bgz").filter(pl.col("description").str.contains(r"Latino.* allele frequency"))
pb.describe_vcf("gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz", compression_type="bgz").filter(pl.col("description").str.contains(r"Latino.* allele frequency"))
Out[20]:
shape: (3, 3)
name | type | description |
---|---|---|
str | str | str |
"AF_amr" | "Float" | "Latino allele frequency (biall… |
"AF_amr_XY" | "Float" | "Latino XY allele frequency (bi… |
"AF_amr_XX" | "Float" | "Latino XX allele frequency (bi… |
In [23]:
Copied!
pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz", "gnomad",
info_fields=['AF_amr'], compression_type="bgz")
query = """
SELECT
chrom,
start,
end,
alt,
array_element(af_amr,1) AS af_amr
FROM gnomad
WHERE
filter = 'HIGH_NCR'
AND
alt = '<DUP>'
"""
pb.sql(f"{query} LIMIT 3").collect()
pb.register_vcf("gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz", "gnomad",
info_fields=['AF_amr'], compression_type="bgz")
query = """
SELECT
chrom,
start,
end,
alt,
array_element(af_amr,1) AS af_amr
FROM gnomad
WHERE
filter = 'HIGH_NCR'
AND
alt = ''
"""
pb.sql(f"{query} LIMIT 3").collect()
0rows [00:00, ?rows/s]
Out[23]:
shape: (3, 5)
chrom | start | end | alt | af_amr |
---|---|---|---|---|
str | u32 | u32 | str | f32 |
"chr1" | 10000 | 295666 | "<DUP>" | 0.000293 |
"chr1" | 138000 | 144000 | "<DUP>" | 0.000166 |
"chr1" | 160500 | 172100 | "<DUP>" | 0.002639 |
Overlapping with a local dataframe¶
In [2]:
Copied!
pb.register_vcf("s3://gnomad-public-us-east-1/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz", "gnomad_sites_chr1")
pb.register_vcf("s3://gnomad-public-us-east-1/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr1.vcf.bgz", "gnomad_sites_chr1")
In [3]:
Copied!
pb.sql("SELECT chrom, start, end, alt FROM gnomad_sites_chr1 LIMIT 10").collect()
pb.sql("SELECT chrom, start, end, alt FROM gnomad_sites_chr1 LIMIT 10").collect()
0rows [00:00, ?rows/s]
Out[3]:
shape: (10, 4)
chrom | start | end | alt |
---|---|---|---|
str | u32 | u32 | str |
"chr1" | 11994 | 11994 | "C" |
"chr1" | 12016 | 12016 | "A" |
"chr1" | 12060 | 12065 | "C" |
"chr1" | 12074 | 12074 | "C" |
"chr1" | 12102 | 12102 | "A" |
"chr1" | 12106 | 12106 | "G" |
"chr1" | 12138 | 12138 | "A" |
"chr1" | 12158 | 12158 | "T" |
"chr1" | 12165 | 12165 | "A" |
"chr1" | 12168 | 12168 | "G" |
In [9]:
Copied!
pb.register_view("v_gnomad_sites_chr1", "SELECT * FROM gnomad_sites_chr1 LIMIT 20")
pb.register_view("v_gnomad_sites_chr1", "SELECT * FROM gnomad_sites_chr1 LIMIT 20")
In [10]:
Copied!
pb.sql("SELECT * FROM v_gnomad_sites_chr1").collect()
pb.sql("SELECT * FROM v_gnomad_sites_chr1").collect()
0rows [00:00, ?rows/s]
Out[10]:
shape: (20, 8)
chrom | start | end | id | ref | alt | qual | filter |
---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str |
"chr1" | 11994 | 11994 | "" | "T" | "C" | 0.0 | "AC0;AS_VQSR" |
"chr1" | 12016 | 12016 | "" | "G" | "A" | 0.0 | "AC0;AS_VQSR" |
"chr1" | 12060 | 12065 | "" | "CTGGAG" | "C" | 0.0 | "AC0;AS_VQSR" |
"chr1" | 12074 | 12074 | "" | "T" | "C" | 0.0 | "AC0;AS_VQSR" |
"chr1" | 12102 | 12102 | "" | "G" | "A" | 0.0 | "AC0;AS_VQSR" |
… | … | … | … | … | … | … | … |
"chr1" | 12198 | 12198 | "rs62635282" | "G" | "C" | 0.0 | "AS_VQSR" |
"chr1" | 12201 | 12201 | "" | "C" | "G" | 0.0 | "AS_VQSR" |
"chr1" | 12214 | 12214 | "rs202068986" | "C" | "G" | 0.0 | "AC0" |
"chr1" | 12225 | 12225 | "" | "C" | "T" | 0.0 | "AS_VQSR" |
"chr1" | 12235 | 12235 | "" | "G" | "A" | 0.0 | "AS_VQSR" |
In [11]:
Copied!
df = pl.DataFrame({
"chrom": ["chr1", "chr1"],
"start": [11993, 12102],
"end": [11996, 12200],
"annotation": ["ann1", "ann2"]
})
pb.from_polars("test_annotation", df)
pb.sql("SELECT * FROM test_annotation").collect()
df = pl.DataFrame({
"chrom": ["chr1", "chr1"],
"start": [11993, 12102],
"end": [11996, 12200],
"annotation": ["ann1", "ann2"]
})
pb.from_polars("test_annotation", df)
pb.sql("SELECT * FROM test_annotation").collect()
0rows [00:00, ?rows/s]
Out[11]:
shape: (2, 4)
chrom | start | end | annotation |
---|---|---|---|
str | i64 | i64 | str |
"chr1" | 11993 | 11996 | "ann1" |
"chr1" | 12102 | 12200 | "ann2" |
In [12]:
Copied!
pb.overlap("v_gnomad_sites_chr1","test_annotation").limit(3).collect()
pb.overlap("v_gnomad_sites_chr1","test_annotation").limit(3).collect()
0rows [00:00, ?rows/s]
Out[12]:
shape: (3, 12)
chrom_1 | start_1 | end_1 | chrom_2 | start_2 | end_2 | annotation_1 | id_2 | ref_2 | alt_2 | qual_2 | filter_2 |
---|---|---|---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | i64 | i64 | str | str | str | str | f64 | str |
"chr1" | 11994 | 11994 | "chr1" | 11993 | 11996 | "ann1" | "" | "T" | "C" | 0.0 | "AC0;AS_VQSR" |
"chr1" | 12102 | 12102 | "chr1" | 12102 | 12200 | "ann2" | "" | "G" | "A" | 0.0 | "AC0;AS_VQSR" |
"chr1" | 12106 | 12106 | "chr1" | 12102 | 12200 | "ann2" | "" | "T" | "G" | 0.0 | "AC0;AS_VQSR" |
Parallel processing¶
These demo datasets are from databio.zip benchmark.
In [54]:
Copied!
pb.set_option("datafusion.execution.target_partitions", "1")
pb.set_option("datafusion.execution.target_partitions", "1")
In [55]:
Copied!
%%time
pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols).collect().count()
%%time
pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols).collect().count()
0rows [00:00, ?rows/s]
CPU times: user 3.25 s, sys: 266 ms, total: 3.51 s Wall time: 3.49 s
Out[55]:
shape: (1, 6)
contig_1 | pos_start_1 | pos_end_1 | contig_2 | pos_start_2 | pos_end_2 |
---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 |
164214743 | 164214743 | 164214743 | 164214743 | 164214743 | 164214743 |
In [8]:
Copied!
pb.set_option("datafusion.execution.target_partitions", "2")
pb.set_option("datafusion.execution.target_partitions", "2")
In [9]:
Copied!
%%time
pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols).collect().count()
%%time
pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols).collect().count()
0rows [00:00, ?rows/s]
CPU times: user 3.11 s, sys: 251 ms, total: 3.36 s Wall time: 1.69 s
Out[9]:
shape: (1, 6)
contig_1 | pos_start_1 | pos_end_1 | contig_2 | pos_start_2 | pos_end_2 |
---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 |
164214743 | 164214743 | 164214743 | 164214743 | 164214743 | 164214743 |
In [58]:
Copied!
pb.set_option("datafusion.execution.target_partitions", "1")
pb.set_option("datafusion.execution.target_partitions", "1")
Streaming¶
Make sure you restart the kernel before running the next cells. memory-profiler is required.
In [3]:
Copied!
%load_ext memory_profiler
%load_ext memory_profiler
Overlap in the streaming mode¶
In [4]:
Copied!
%memit pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols, streaming=True).sink_parquet("/tmp/overlap.parquet")
%memit pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols, streaming=True).sink_parquet("/tmp/overlap.parquet")
peak memory: 797.94 MiB, increment: 586.89 MiB
Overlap with materialization¶
In [4]:
Copied!
%memit pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols).collect().write_parquet("/tmp/overlap.parquet")
%memit pb.overlap("data/ex-rna/*.parquet", "data/chainRn4/*.parquet", cols1=cols, cols2=cols).collect().write_parquet("/tmp/overlap.parquet")
0rows [00:00, ?rows/s]
peak memory: 15900.17 MiB, increment: 15684.67 MiB
Working with zero-based coordinates¶
In [11]:
Copied!
BIO_PD_DF1 = pd.read_parquet(f"data/exons/")
BIO_PD_DF2 = pd.read_parquet(f"data/fBrain-DS14718/")
BIO_PD_DF1 = pd.read_parquet(f"data/exons/")
BIO_PD_DF2 = pd.read_parquet(f"data/fBrain-DS14718/")
In [28]:
Copied!
bf_overlap = bf.overlap(
BIO_PD_DF1,
BIO_PD_DF2,
cols1=cols,
cols2=cols,
suffixes=("_1", "_2"),
how="inner",
)
bf_overlap = bf.overlap(
BIO_PD_DF1,
BIO_PD_DF2,
cols1=cols,
cols2=cols,
suffixes=("_1", "_2"),
how="inner",
)
In [29]:
Copied!
pb_overlap = pb.overlap(
BIO_PD_DF1,
BIO_PD_DF2,
cols1=cols,
cols2=cols,
output_type="pandas.DataFrame",
suffixes=("_1", "_2"),
)
pb_overlap = pb.overlap(
BIO_PD_DF1,
BIO_PD_DF2,
cols1=cols,
cols2=cols,
output_type="pandas.DataFrame",
suffixes=("_1", "_2"),
)
Since polars-bio is one-based, the assertion will fail.
In [30]:
Copied!
assert len(bf_overlap) == len(pb_overlap)
assert len(bf_overlap) == len(pb_overlap)
--------------------------------------------------------------------------- AssertionError Traceback (most recent call last) Cell In[30], line 1 ----> 1 assert len(bf_overlap) == len(pb_overlap) AssertionError:
To use polars-bio with zero-based coordinates, set use_zero_based=True
:
In [33]:
Copied!
pb_overlap = pb.overlap(
BIO_PD_DF1,
BIO_PD_DF2,
cols1=cols,
cols2=cols,
output_type="pandas.DataFrame",
suffixes=("_1", "_2"),
use_zero_based=True,
)
pb_overlap = pb.overlap(
BIO_PD_DF1,
BIO_PD_DF2,
cols1=cols,
cols2=cols,
output_type="pandas.DataFrame",
suffixes=("_1", "_2"),
use_zero_based=True,
)
WARNING:polars_bio:0-based coordinate system was selected. Please ensure that both datasets follow this coordinate system.
In [34]:
Copied!
assert len(bf_overlap) == len(pb_overlap)
assert len(bf_overlap) == len(pb_overlap)
Example - gnomAD VCF file reading¶
In [1]:
Copied!
import polars_bio as pb
import polars as pl
import polars_bio as pb
import polars as pl
In [5]:
Copied!
gcs_vcf_path = (
"gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
)
gcs_vcf_path = (
"gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
)
In [6]:
Copied!
# we need to override the compression type as the file is bgzipped not gzipped as the extension suggests
pb.read_vcf(gcs_vcf_path, compression_type="bgz").limit(3).collect()
# we need to override the compression type as the file is bgzipped not gzipped as the extension suggests
pb.read_vcf(gcs_vcf_path, compression_type="bgz").limit(3).collect()
Out[6]:
shape: (3, 8)
chrom | start | end | id | ref | alt | qual | filter |
---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str |
"chr1" | 10000 | 295666 | "gnomAD-SV_v3_DUP_chr1_01c2781c" | "N" | "<DUP>" | 134.0 | "HIGH_NCR" |
"chr1" | 10434 | 10434 | "gnomAD-SV_v3_BND_chr1_1a45f73a" | "N" | "<BND>" | 260.0 | "HIGH_NCR;UNRESOLVED" |
"chr1" | 10440 | 10440 | "gnomAD-SV_v3_BND_chr1_3fa36917" | "N" | "<BND>" | 198.0 | "HIGH_NCR;UNRESOLVED" |
In [7]:
Copied!
pb.describe_vcf(gcs_vcf_path, compression_type="bgz").sort("name").limit(5)
pb.describe_vcf(gcs_vcf_path, compression_type="bgz").sort("name").limit(5)
Out[7]:
shape: (5, 3)
name | type | description |
---|---|---|
str | str | str |
"AC" | "Integer" | "Number of non-reference allele… |
"AC_XX" | "Integer" | "Number of non-reference XX all… |
"AC_XY" | "Integer" | "Number of non-reference XY all… |
"AC_afr" | "Integer" | "Number of non-reference Africa… |
"AC_afr_XX" | "Integer" | "Number of non-reference Africa… |
AWS S3¶
In [8]:
Copied!
# here we can use automatic compression detection
aws_s3_vcf_path = "s3://gnomad-public-us-east-1/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz"
# here we can use automatic compression detection
aws_s3_vcf_path = "s3://gnomad-public-us-east-1/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz"
In [9]:
Copied!
pb.read_vcf(aws_s3_vcf_path, chunk_size=8, concurrent_fetches=1).limit(3).collect()
pb.read_vcf(aws_s3_vcf_path, chunk_size=8, concurrent_fetches=1).limit(3).collect()
Out[9]:
shape: (3, 8)
chrom | start | end | id | ref | alt | qual | filter |
---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str |
"chr21" | 5031905 | 5031905 | "" | "C" | "A" | 0.0 | "AC0;AS_VQSR" |
"chr21" | 5031905 | 5031905 | "" | "C" | "T" | 0.0 | "AC0;AS_VQSR" |
"chr21" | 5031909 | 5031909 | "" | "T" | "C" | 0.0 | "AC0;AS_VQSR" |
2. How to specify additional VCF INFO fields to be parsed¶
In [10]:
Copied!
vcf_info_fields = ["SVTYPE", "SVLEN"]
pb.read_vcf(gcs_vcf_path, info_fields=vcf_info_fields, compression_type="bgz").limit(3).collect()
vcf_info_fields = ["SVTYPE", "SVLEN"]
pb.read_vcf(gcs_vcf_path, info_fields=vcf_info_fields, compression_type="bgz").limit(3).collect()
Out[10]:
shape: (3, 10)
chrom | start | end | id | ref | alt | qual | filter | svtype | svlen |
---|---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str | str | i32 |
"chr1" | 10000 | 295666 | "gnomAD-SV_v3_DUP_chr1_01c2781c" | "N" | "<DUP>" | 134.0 | "HIGH_NCR" | "DUP" | 285666 |
"chr1" | 10434 | 10434 | "gnomAD-SV_v3_BND_chr1_1a45f73a" | "N" | "<BND>" | 260.0 | "HIGH_NCR;UNRESOLVED" | "BND" | -1 |
"chr1" | 10440 | 10440 | "gnomAD-SV_v3_BND_chr1_3fa36917" | "N" | "<BND>" | 198.0 | "HIGH_NCR;UNRESOLVED" | "BND" | -1 |
3. How to spead up reading local VCF files with multiple threads¶
In [ ]:
Copied!
! gsutil -m cp $gcs_vcf_path /tmp/gnomad.v4.1.sv.sites.vcf.gz &> /dev/null
! gsutil -m cp $gcs_vcf_path /tmp/gnomad.v4.1.sv.sites.vcf.gz &> /dev/null
In [3]:
Copied!
%%time
pb.read_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", thread_num=1, compression_type="bgz").count().collect()
%%time
pb.read_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", thread_num=1, compression_type="bgz").count().collect()
0rows [00:00, ?rows/s]
CPU times: user 13.7 s, sys: 2.13 s, total: 15.8 s Wall time: 11.9 s
Out[3]:
shape: (1, 8)
chrom | start | end | id | ref | alt | qual | filter |
---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
2154486 | 2154486 | 2154486 | 2154486 | 2154486 | 2154486 | 2154486 | 2154486 |
In [4]:
Copied!
%%time
pb.read_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", thread_num=4, compression_type="bgz").count().collect()
%%time
pb.read_vcf("/tmp/gnomad.v4.1.sv.sites.vcf.gz", thread_num=4, compression_type="bgz").count().collect()
0rows [00:00, ?rows/s]
CPU times: user 13 s, sys: 1.73 s, total: 14.8 s Wall time: 3.42 s
Out[4]:
shape: (1, 8)
chrom | start | end | id | ref | alt | qual | filter |
---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
2154486 | 2154486 | 2154486 | 2154486 | 2154486 | 2154486 | 2154486 | 2154486 |
4. How to perform an overlap operation on two remote VCF files in streaming mode¶
In [11]:
Copied!
vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
vcf_2 = "gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz"
vcf_1 = "gs://gcp-public-data--gnomad/release/4.1/genome_sv/gnomad.v4.1.sv.sites.vcf.gz"
vcf_2 = "gs://gcp-public-data--gnomad/release/4.1/vcf/exomes/gnomad.exomes.v4.1.sites.chr21.vcf.bgz"
In [15]:
Copied!
object_storage_options = pb.ObjectStorageOptions(
allow_anonymous=True,
enable_request_payer=False,
chunk_size=64,
concurrent_fetches=8,
max_retries=5,
timeout=10,
compression_type="bgz",
)
vcf_read_options_1 = pb.VcfReadOptions(
info_fields=["SVTYPE", "SVLEN"],
thread_num=1,
object_storage_options=object_storage_options,
)
vcf_read_options_2 = pb.VcfReadOptions(
object_storage_options=object_storage_options,
)
read_options_1 = pb.ReadOptions(vcf_read_options=vcf_read_options_1)
read_options_2 = pb.ReadOptions(vcf_read_options=vcf_read_options_2)
object_storage_options = pb.ObjectStorageOptions(
allow_anonymous=True,
enable_request_payer=False,
chunk_size=64,
concurrent_fetches=8,
max_retries=5,
timeout=10,
compression_type="bgz",
)
vcf_read_options_1 = pb.VcfReadOptions(
info_fields=["SVTYPE", "SVLEN"],
thread_num=1,
object_storage_options=object_storage_options,
)
vcf_read_options_2 = pb.VcfReadOptions(
object_storage_options=object_storage_options,
)
read_options_1 = pb.ReadOptions(vcf_read_options=vcf_read_options_1)
read_options_2 = pb.ReadOptions(vcf_read_options=vcf_read_options_2)
In [16]:
Copied!
pb.overlap(vcf_1, vcf_2, streaming=True, read_options1=read_options_1, read_options2=read_options_2).sink_csv(
"/tmp/streaming_run.csv"
)
pb.overlap(vcf_1, vcf_2, streaming=True, read_options1=read_options_1, read_options2=read_options_2).sink_csv(
"/tmp/streaming_run.csv"
)
In [19]:
Copied!
pl.read_csv("/tmp/streaming_run.csv").limit(3)
pl.read_csv("/tmp/streaming_run.csv").limit(3)
Out[19]:
shape: (3, 18)
chrom_1 | start_1 | end_1 | chrom_2 | start_2 | end_2 | id_1 | ref_1 | alt_1 | qual_1 | filter_1 | id_2 | ref_2 | alt_2 | qual_2 | filter_2 | svtype_2 | svlen_2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
str | i64 | i64 | str | i64 | i64 | str | str | str | f64 | str | str | str | str | f64 | str | str | i64 |
"chr21" | 5019150 | 5047500 | "chr21" | 5036183 | 5036183 | "" | "A" | "C" | 0.0 | "AC0;AS_VQSR" | "gnomAD-SV_v3_DUP_chr21_029eb66… | "N" | "<DUP>" | 34.0 | "PASS" | "DUP" | 28350 |
"chr21" | 5019150 | 5047500 | "chr21" | 5036184 | 5036184 | "" | "G" | "A" | 0.0 | "AS_VQSR" | "gnomAD-SV_v3_DUP_chr21_029eb66… | "N" | "<DUP>" | 34.0 | "PASS" | "DUP" | 28350 |
"chr21" | 5019150 | 5047500 | "chr21" | 5036185 | 5036185 | "" | "G" | "A" | 0.0 | "AS_VQSR" | "gnomAD-SV_v3_DUP_chr21_029eb66… | "N" | "<DUP>" | 34.0 | "PASS" | "DUP" | 28350 |
In [18]:
Copied!
pb.overlap(vcf_1, vcf_2, streaming=False, read_options1=read_options_1, read_options2=read_options_2).collect().count()
pb.overlap(vcf_1, vcf_2, streaming=False, read_options1=read_options_1, read_options2=read_options_2).collect().count()
0rows [00:00, ?rows/s]
Out[18]:
shape: (1, 18)
chrom_1 | start_1 | end_1 | chrom_2 | start_2 | end_2 | id_1 | ref_1 | alt_1 | qual_1 | filter_1 | id_2 | ref_2 | alt_2 | qual_2 | filter_2 | svtype_2 | svlen_2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 |
5. How to read a VCF from Google Life Sciences¶
In [20]:
Copied!
gcs_vcf_path = "gs://genomics-public-data/platinum-genomes/vcf/NA12878_S1.genome.vcf"
gcs_vcf_path = "gs://genomics-public-data/platinum-genomes/vcf/NA12878_S1.genome.vcf"
In [21]:
Copied!
info_fields=["AC", "AF"]
info_fields=["AC", "AF"]
In [22]:
Copied!
pb.read_vcf(gcs_vcf_path, info_fields=info_fields).limit(3).collect()
pb.read_vcf(gcs_vcf_path, info_fields=info_fields).limit(3).collect()
Out[22]:
shape: (3, 10)
chrom | start | end | id | ref | alt | qual | filter | ac | af |
---|---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str | list[i32] | list[f32] |
"chrM" | 1 | 1 | "" | "G" | "" | 0.0 | "PASS" | null | null |
"chrM" | 2 | 72 | "" | "A" | "" | 0.0 | "PASS" | null | null |
"chrM" | 73 | 73 | "" | "G" | "A" | 8752.780273 | "TruthSensitivityTranche99.90to… | [2] | [1.0] |
6. SQL data processing¶
Check SQL reference for details.
In [24]:
Copied!
pb.register_vcf(vcf_1, "gnomad_sv", thread_num=1, info_fields=["SVTYPE", "SVLEN"], compression_type="bgz")
pb.register_vcf(vcf_1, "gnomad_sv", thread_num=1, info_fields=["SVTYPE", "SVLEN"], compression_type="bgz")
In [25]:
Copied!
pb.sql("SELECT chrom, svtype FROM gnomad_sv").limit(3).collect()
pb.sql("SELECT chrom, svtype FROM gnomad_sv").limit(3).collect()
Out[25]:
shape: (3, 2)
chrom | svtype |
---|---|
str | str |
"chr1" | "DUP" |
"chr1" | "BND" |
"chr1" | "BND" |
In [26]:
Copied!
pb.sql("SELECT * FROM gnomad_sv WHERE SVTYPE = 'DEL' AND SVLEN > 1000").limit(3).collect()
pb.sql("SELECT * FROM gnomad_sv WHERE SVTYPE = 'DEL' AND SVLEN > 1000").limit(3).collect()
Out[26]:
shape: (3, 10)
chrom | start | end | id | ref | alt | qual | filter | svtype | svlen |
---|---|---|---|---|---|---|---|---|---|
str | u32 | u32 | str | str | str | f64 | str | str | i32 |
"chr1" | 22000 | 30000 | "gnomAD-SV_v3_DEL_chr1_fa103016" | "N" | "<DEL>" | 999.0 | "HIGH_NCR" | "DEL" | 8000 |
"chr1" | 40000 | 47000 | "gnomAD-SV_v3_DEL_chr1_b26f63f7" | "N" | "<DEL>" | 145.0 | "PASS" | "DEL" | 7000 |
"chr1" | 79086 | 88118 | "gnomAD-SV_v3_DEL_chr1_733c4ef0" | "N" | "<DEL:ME:LINE1>" | 344.0 | "UNRESOLVED" | "DEL" | 9032 |
In [27]:
Copied!
pb.sql("SELECT alt, count(*) as cnt FROM gnomad_sv group by alt").collect_schema()
pb.sql("SELECT alt, count(*) as cnt FROM gnomad_sv group by alt").collect_schema()
Out[27]:
Schema([('alt', String), ('cnt', Int64)])
In [28]:
Copied!
pb.sql("SELECT alt, count(*) as cnt FROM gnomad_sv group by alt").collect()
pb.sql("SELECT alt, count(*) as cnt FROM gnomad_sv group by alt").collect()
0rows [00:00, ?rows/s]
Out[28]:
shape: (13, 2)
alt | cnt |
---|---|
str | i64 |
"<DUP>" | 269326 |
"<BND>" | 356035 |
"<CNV>" | 721 |
"<DEL>" | 1197080 |
"<INS>" | 83441 |
… | … |
"<INS:ME:SVA>" | 17607 |
"<CPX>" | 15189 |
"<INV>" | 2193 |
"<DEL:ME:HERVK>" | 693 |
"<CTX>" | 99 |
In [29]:
Copied!
pb.sql("SELECT chrom, count(*) as cnt FROM gnomad_sv GROUP BY chrom ORDER BY chrom").collect()
pb.sql("SELECT chrom, count(*) as cnt FROM gnomad_sv GROUP BY chrom ORDER BY chrom").collect()
Out[29]:
shape: (24, 2)
chrom | cnt |
---|---|
str | i64 |
"chr1" | 182804 |
"chr10" | 96755 |
"chr11" | 95690 |
"chr12" | 97655 |
"chr13" | 63839 |
… | … |
"chr7" | 131866 |
"chr8" | 101224 |
"chr9" | 87748 |
"chrX" | 78076 |
"chrY" | 12488 |
In [30]:
Copied!
pb.sql("SELECT * FROM gnomad_sv WHERE chrom='chr1'", streaming=True).sink_csv("/tmp/gnomad_chr1.csv")
pb.sql("SELECT * FROM gnomad_sv WHERE chrom='chr1'", streaming=True).sink_csv("/tmp/gnomad_chr1.csv")
In [31]:
Copied!
pl.read_csv("/tmp/gnomad_chr1.csv").count()
pl.read_csv("/tmp/gnomad_chr1.csv").count()
Out[31]:
shape: (1, 10)
chrom | start | end | id | ref | alt | qual | filter | svtype | svlen |
---|---|---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
182804 | 182804 | 182804 | 182804 | 182804 | 182804 | 182804 | 182804 | 182804 | 182804 |
In [32]:
Copied!
pb.register_vcf(vcf_2, "gnomad_exomes", info_fields=["AC", "AF"])
pb.register_vcf(vcf_2, "gnomad_exomes", info_fields=["AC", "AF"])
In [33]:
Copied!
pb.sql("SELECT replace(chrom,'chr','') AS chrom, start, ac,af FROM gnomad_exomes WHERE array_element(af,1)>0.01").limit(10).collect()
pb.sql("SELECT replace(chrom,'chr','') AS chrom, start, ac,af FROM gnomad_exomes WHERE array_element(af,1)>0.01").limit(10).collect()
Out[33]:
shape: (10, 4)
chrom | start | ac | af |
---|---|---|---|
str | u32 | list[i32] | list[f32] |
"21" | 5033364 | [372992] | [0.337086] |
"21" | 5033539 | [1107064] | [0.996312] |
"21" | 5034629 | [15145] | [0.020082] |
"21" | 5035021 | [1811] | [0.307888] |
"21" | 5035108 | [2] | [0.010989] |
"21" | 5035658 | [255555] | [0.336447] |
"21" | 5035846 | [37233] | [0.286906] |
"21" | 5035921 | [1682] | [0.010757] |
"21" | 5116593 | [4032] | [0.018626] |
"21" | 5116760 | [101] | [0.014914] |
In [34]:
Copied!
pb.overlap("gnomad_sv", "gnomad_exomes").collect().count()
pb.overlap("gnomad_sv", "gnomad_exomes").collect().count()
0rows [00:00, ?rows/s]
Out[34]:
shape: (1, 20)
chrom_1 | start_1 | end_1 | chrom_2 | start_2 | end_2 | id_1 | ref_1 | alt_1 | qual_1 | filter_1 | ac_1 | af_1 | id_2 | ref_2 | alt_2 | qual_2 | filter_2 | svtype_2 | svlen_2 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 | u32 |
17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17587503 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 | 17882545 |
In [35]:
Copied!
pb.register_vcf(vcf_2, "gnomad_exomes")
pb.register_vcf(vcf_2, "gnomad_exomes")
In [36]:
Copied!
pb.overlap("gnomad_sv", "gnomad_exomes", streaming=True).sink_csv("/tmp/overlap.csv")
pb.overlap("gnomad_sv", "gnomad_exomes", streaming=True).sink_csv("/tmp/overlap.csv")
In [38]:
Copied!
! wc -l /tmp/overlap.csv
! wc -l /tmp/overlap.csv
17882546 /tmp/overlap.csv