Initialize
Download sample data
[1]:
%%bash
# Download the sample inputs (BAM, CRAM, reference FASTA + index, BED, VCF)
# into ./data. Safe to re-run: files that are already present are skipped.

# wget runs with --quiet, so without this a failed download (or a failed cd)
# would go unnoticed and only surface later as a missing-file error.
set -euo pipefail

mkdir -p data
cd data

base_url=http://biodatageeks.ii.pw.edu.pl/sequila/data
for name in \
    NA12878.multichrom.md.bam \
    Homo_sapiens_assembly18_chr1_chrM.small.fasta \
    Homo_sapiens_assembly18_chr1_chrM.small.fasta.fai \
    tgp_exome_hg18.bed \
    NA12878.multichrom.md.cram \
    test.vcf
do
    if [ ! -f "$name" ]; then
        # Download under a temporary name first so an interrupted transfer
        # is never mistaken for a complete file on the next run.
        wget --quiet -O "$name.part" "$base_url/$name"
        mv "$name.part" "$name"
    fi
done
[2]:
### Set helper variables
[3]:
import os

# All sample files live in the "data" directory under the current working directory.
base_path = os.getcwd() + '/data'

# Input files, each addressed relative to base_path.
bam_path = '/'.join((base_path, 'NA12878.multichrom.md.bam'))
cram_path = '/'.join((base_path, 'NA12878.multichrom.md.cram'))
vcf_path = '/'.join((base_path, 'test.vcf'))
ref_path = '/'.join((base_path, 'Homo_sapiens_assembly18_chr1_chrM.small.fasta'))
bed_path = '/'.join((base_path, 'tgp_exome_hg18.bed'))

# Names shared by the cells that follow.
sample_id, table_name, app_name = 'NA12878', 'reads', 'sequila'
Initialize a PySeQuiLa session
[4]:
from pysequila import SequilaSession
import pandas as pd

# Lift pandas' display limits: show every column and do not truncate cell text.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

# Obtain the session object used by the rest of the notebook
# (getOrCreate: an existing session is reused rather than duplicated).
ss = (
    SequilaSession
    .builder
    .getOrCreate()
)
Create a table
[5]:
# Register the BAM file as a SQL table backed by the BAM data source.
# IF NOT EXISTS means an existing table of that name is left as it is.
# The adjacent literals below concatenate into one single-line statement.
create_table_stmt = (
    f'CREATE TABLE IF NOT EXISTS {table_name} '
    'USING org.biodatageeks.sequila.datasources.BAM.BAMDataSource '
    f'OPTIONS(path "{bam_path}")'
)
ss.sql(create_table_stmt)
[5]:
DataFrame[]