Initialize

Download sample data

[1]:
%%bash
# Fetch the sample files used by this notebook into ./data.
#
# Stop at the first failing command. Because wget runs with --quiet, a failed
# download would otherwise go unnoticed unless it happened to be the last one.
set -euo pipefail

base_url=http://biodatageeks.ii.pw.edu.pl/sequila/data

mkdir -p data
cd data

for name in \
    NA12878.multichrom.md.bam \
    Homo_sapiens_assembly18_chr1_chrM.small.fasta \
    Homo_sapiens_assembly18_chr1_chrM.small.fasta.fai \
    tgp_exome_hg18.bed \
    NA12878.multichrom.md.cram \
    test.vcf
do
    # Skip files that are already present so re-running the cell does not
    # download them again (plain wget would save a second copy as NAME.1).
    if [ -e "$name" ]; then
        continue
    fi
    # Download under a temporary name and rename only on success, so an
    # interrupted transfer is never mistaken for a complete file.
    wget --quiet --output-document="$name.part" "$base_url/$name"
    mv "$name.part" "$name"
done
[2]:
### Set helper variables
[3]:
import os
# Directory holding the sample files: "data" under the current working directory.
base_path = '/'.join((os.getcwd(), 'data'))

# Locations of the individual sample files inside base_path.
bam_path = base_path + '/NA12878.multichrom.md.bam'
cram_path = base_path + '/NA12878.multichrom.md.cram'
vcf_path = base_path + '/test.vcf'
ref_path = base_path + '/Homo_sapiens_assembly18_chr1_chrM.small.fasta'
bed_path = base_path + '/tgp_exome_hg18.bed'

# Plain string constants reused by later cells.
sample_id = 'NA12878'
table_name = 'reads'
app_name = 'sequila'

Initialize a PySeQuiLa session

[4]:
from pysequila import SequilaSession
import pandas as pd
# Remove pandas' display limits on the number of columns and on column width.
# Full option names are used on purpose: pandas resolves abbreviated names by
# pattern matching, which can break if a similarly named option is added later.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)


# Obtain the session object through SequilaSession's builder; no additional
# builder settings are applied before getOrCreate().
ss = SequilaSession.builder.getOrCreate()

Create a table

[5]:
# Build the statement that creates the table (unless it already exists),
# using the BAM data source and pointing it at the downloaded BAM file.
# The padding inside the fragments is kept so the statement text is unchanged.
create_table_sql = (
    f'CREATE TABLE IF NOT EXISTS {table_name} '
    '         USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource '
    f'         OPTIONS(path "{bam_path}")'
)
# Keep the call as the cell's last expression so its result is still displayed.
ss.sql(create_table_sql)
[5]:
DataFrame[]