File formats

Initialize the SeQuiLaSession and download the sample data (see the Initialize section for details).

[1]:
# Execute the setup notebook in this kernel's namespace.
# NOTE(review): presumably this is what defines `ss`, `table_name`, and the
# `*_path` variables used by the cells below -- confirm in initialize.ipynb.
%run initialize.ipynb

BAM

[2]:
# Register the BAM file at `bam_path` as table `table_name`, backed by the
# SeQuiLa BAM data source. IF NOT EXISTS keeps the cell safe to re-run.
# The trailing backslashes join the three lines into a single-line statement.
create_bam_table_sql = f'''CREATE TABLE IF NOT EXISTS {table_name} \
         USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource \
         OPTIONS(path "{bam_path}")'''
ss.sql(create_bam_table_sql)
[2]:
DataFrame[]
[3]:
# Show up to five distinct (cigar, tag_MD) pairs from the BAM-backed table.
# There is no ORDER BY, so which five pairs come back is not guaranteed.
bam_preview = ss.sql(f'''SELECT distinct cigar,tag_MD FROM {table_name} LIMIT 5''')
bam_preview.toPandas()
[3]:
cigar tag_MD
0 101M 26G26G47
1 76M 19C56
2 101M 53A47
3 95M6S 95
4 76M 18A23A33

CRAM

[4]:
# Register the CRAM file at `cram_path` as table `<table_name>_cram`, backed by
# the SeQuiLa CRAM data source. Unlike the BAM table, a reference path
# (`refPath`) is passed in addition to the data path.
create_cram_table_sql = f'''CREATE TABLE IF NOT EXISTS {table_name}_cram \
         USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource \
         OPTIONS(path "{cram_path}", refPath "{ref_path}")'''
ss.sql(create_cram_table_sql)
[4]:
DataFrame[]
[5]:
# Show up to five distinct (cigar, tag_MD) pairs from the CRAM-backed table.
# There is no ORDER BY, so which five pairs come back is not guaranteed.
cram_preview = ss.sql(f'''SELECT distinct cigar,tag_MD FROM {table_name}_cram LIMIT 5''')
cram_preview.toPandas()
[5]:
cigar tag_MD
0 101M 26G26G47
1 76M 19C56
2 101M 53A47
3 95M6S 95
4 76M 18A23A33

VCF

[6]:
# Register the VCF file at `vcf_path` as table `variants`, backed by the
# SeQuiLa VCF data source. IF NOT EXISTS keeps the cell safe to re-run.
create_variants_table_sql = f'''CREATE TABLE IF NOT EXISTS variants \
         USING org.biodatageeks.sequila.datasources.VCF.VCFDataSource \
         OPTIONS(path "{vcf_path}" )'''
ss.sql(create_variants_table_sql)
[6]:
DataFrame[]
[7]:
# Show five rows of the position and allele columns from the `variants` table.
# Nothing is interpolated into this query, so a plain string literal is used
# rather than an f-string.
ss.sql('''SELECT contig, pos_start, pos_end, ref, alt FROM variants LIMIT 5''').toPandas()
[7]:
contig pos_start pos_end ref alt
0 20 14369 14370 G [A]
1 20 17329 17330 T [A]
2 20 1110695 1110696 A [G, T]
3 20 1230236 1230237 T None
4 20 1234566 1234569 GTC [G, GTCT]

BED

[8]:
# Register the BED file at `bed_path` as table `targets`, read through the
# built-in csv source with an explicit three-column schema.
# The string is not raw, so Python turns "\t" into a literal tab character
# before the statement reaches Spark; the delimiter is therefore a tab.
create_targets_table_sql = f'''
    CREATE TABLE IF NOT EXISTS targets(contig String,pos_start Integer,pos_end Integer) \
    USING csv \
    OPTIONS (path "{bed_path}", delimiter "\t")'''
ss.sql(create_targets_table_sql)

[8]:
DataFrame[]
[9]:
# Show five intervals from `targets`, removing every occurrence of "chr" from
# the contig name.
# The SQL string literals are single-quoted so they stay string literals even
# when Spark is configured to treat double-quoted text as identifiers; the
# Python string is delimited with triple double quotes to hold them cleanly.
# Nothing is interpolated, so no f-string is needed.
ss.sql("""SELECT replace(targets.contig, 'chr', '') AS contig, pos_start, pos_end FROM targets LIMIT 5""").toPandas()
[9]:
contig pos_start pos_end
0 1 4505 4745
1 1 4806 4926
2 1 5614 5853
3 1 6462 6582
4 1 6697 6937
[ ]: