File formats
Initialize the SeQuiLaSession and download the sample data (see the Initialize section for details).
[1]:
%run initialize.ipynb
BAM
[2]:
# Register the BAM file at `bam_path` as a Spark SQL table named `table_name`,
# using SeQuiLa's BAM data source. IF NOT EXISTS keeps the cell re-runnable.
# The trailing backslashes join the three lines into a single-line statement.
bam_ddl = f'''CREATE TABLE IF NOT EXISTS {table_name} \
USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource \
OPTIONS(path "{bam_path}")'''
ss.sql(bam_ddl)
[2]:
DataFrame[]
[3]:
# Preview five distinct (cigar, tag_MD) pairs from the BAM-backed table.
# There is no ORDER BY, so which five pairs come back is up to the engine.
bam_preview = ss.sql(f'''SELECT distinct cigar,tag_MD FROM {table_name} LIMIT 5''')
bam_preview.toPandas()
[3]:
| | cigar | tag_MD |
---|---|---|
0 | 101M | 26G26G47 |
1 | 76M | 19C56 |
2 | 101M | 53A47 |
3 | 95M6S | 95 |
4 | 76M | 18A23A33 |
CRAM
[4]:
# Register the CRAM file at `cram_path` as the table `<table_name>_cram`.
# The CRAM data source class sits under the same `BAM` package as the BAM one.
# `refPath` is passed alongside `path`; presumably it points at the reference
# sequence the CRAM was compressed against -- confirm in initialize.ipynb.
cram_ddl = f'''CREATE TABLE IF NOT EXISTS {table_name}_cram \
USING org.biodatageeks.sequila.datasources.BAM.CRAMDataSource \
OPTIONS(path "{cram_path}", refPath "{ref_path}")'''
ss.sql(cram_ddl)
[4]:
DataFrame[]
[5]:
# Same preview as for BAM, but read through the CRAM-backed table.
# There is no ORDER BY, so which five pairs come back is up to the engine.
cram_preview = ss.sql(f'''SELECT distinct cigar,tag_MD FROM {table_name}_cram LIMIT 5''')
cram_preview.toPandas()
[5]:
| | cigar | tag_MD |
---|---|---|
0 | 101M | 26G26G47 |
1 | 76M | 19C56 |
2 | 101M | 53A47 |
3 | 95M6S | 95 |
4 | 76M | 18A23A33 |
VCF
[6]:
# Register the VCF file at `vcf_path` as a table with the fixed name `variants`,
# using SeQuiLa's VCF data source. IF NOT EXISTS keeps the cell re-runnable.
vcf_ddl = f'''CREATE TABLE IF NOT EXISTS variants \
USING org.biodatageeks.sequila.datasources.VCF.VCFDataSource \
OPTIONS(path "{vcf_path}" )'''
ss.sql(vcf_ddl)
[6]:
DataFrame[]
[7]:
# Preview five rows of the `variants` table (contig, start/end position, ref, alt).
# Plain string rather than an f-string: the query has no placeholders to fill in.
variants_preview = ss.sql('''SELECT contig, pos_start, pos_end, ref, alt FROM variants LIMIT 5''')
variants_preview.toPandas()
[7]:
| | contig | pos_start | pos_end | ref | alt |
---|---|---|---|---|---|
0 | 20 | 14369 | 14370 | G | [A] |
1 | 20 | 17329 | 17330 | T | [A] |
2 | 20 | 1110695 | 1110696 | A | [G, T] |
3 | 20 | 1230236 | 1230237 | T | None |
4 | 20 | 1234566 | 1234569 | GTC | [G, GTCT] |
BED
[8]:
# Expose the BED file at `bed_path` as the table `targets` via Spark's built-in
# csv source, declaring the three columns explicitly (contig, pos_start, pos_end).
# This is a normal (non-raw) Python string, so "\t" reaches Spark as a literal
# tab character used as the field delimiter.
bed_ddl = f'''
CREATE TABLE IF NOT EXISTS targets(contig String,pos_start Integer,pos_end Integer) \
USING csv \
OPTIONS (path "{bed_path}", delimiter "\t")'''
ss.sql(bed_ddl)
[8]:
DataFrame[]
[9]:
# Preview five BED targets with every "chr" substring removed from the contig name.
# Plain string rather than an f-string: the query has no placeholders to fill in.
targets_preview = ss.sql('''SELECT replace(targets.contig,"chr","") as contig, pos_start, pos_end FROM targets LIMIT 5''')
targets_preview.toPandas()
[9]:
| | contig | pos_start | pos_end |
---|---|---|---|
0 | 1 | 4505 | 4745 |
1 | 1 | 4806 | 4926 |
2 | 1 | 5614 | 5853 |
3 | 1 | 6462 | 6582 |
4 | 1 | 6697 | 6937 |
[ ]: