#!/usr/bin/env python
#######################################################################
#######################################################################
# VAR-MD : Variant Analysis in Rare and Mendelian Diseases
# Developed by Murat Sincan (C) 2012
# msincan@gmail.com
#
# homepage: http://code.google.com/p/var-md/
# This program is described in:
# Sincan Et al. 2012 Human Mutation doi: 10.1002/humu.2203
#
# This file is part of VAR-MD
# VAR-MD is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see .
#######################################################################
#######################################################################
import argparse
from copy import deepcopy
import cProfile
from fractions import Fraction
import glob
import os
import string
import shutil
import shlex
import subprocess
import sys
import smtplib
import time
import re
import urllib
import urllib2
from HTMLParser import HTMLParser
import sqlite3
from sqlalchemy import create_engine
from sqlalchemy import func
from sqlalchemy import Table, Column, Integer, Float, String, MetaData, ForeignKey, Index, and_, or_, not_
from sqlalchemy.orm import mapper
if not (sys.version_info.major==2 and sys.version_info.minor==7):
print 'VAR-MD requires python 2.7, current python version is %s.%s'%(sys.version_info.major,sys.version_info.minor)
sys.exit(1)
startdate=time.ctime()
startall=time.time()
#-------------------------------------------------------------------------------
chrs=["chr"+str(x) for x in range(1,23)]
chrs.extend(["chrX","chrY","chrM"])
chrs_random=["chr"+str(x) for x in range(1,23)]
chrs_random.extend(["chrX","chrY","chrM","chrX_random","chrY_random","chrM_random"])
chrs_random.extend(["chr%s_random"%str(x) for x in range(1,23)])
def arguments():
global options
global args
parser = argparse.ArgumentParser(description = 'VAR-MD: Clinical and research analysis of \
next-gen sequencing data for Mendelian diseases in pedigrees')
parser.add_argument("--func",
dest="function_name",
default="usage",
help="Function name to be run")
parser.add_argument("--cleanup",
dest="cleanup", default = 'no',
help="--cleanup yes => program will cleanup intermideate/temporary files, default is no")
parser.add_argument("--database",
dest="database",
help="""define path\
to databases as --database [path_to_db]""")
parser.add_argument('--downdb', dest = 'downdb', default ='', help = 'downloads UCSC annotation tables')
parser.add_argument('--buildver',dest = 'buildver', default = 'hg19',
help = 'specify a valid genome build (hg18, hg19)')
parser.add_argument('--dbdir',dest = 'dbdir', default = 'dbdir',
help = 'database directory where the public annoatation tables reside')
parser.add_argument('--input', dest = 'input_file', default = '',
help = 'path for the input file to be processed')
parser.add_argument('--config', dest = 'config_file', default = os.path.join(os.path.dirname(sys.argv[0]),'varmd.cfg'),
help = 'absolute path for the configuration file')
parser.add_argument('--projects_metadata', dest = 'projects_metadata_file', default = '',
help = 'absolute path for the projects metadata file')
parser.add_argument('--output', dest = 'output_file', default = '',
help = 'path for the output file to be generated')
parser.add_argument('--output_snv', dest = 'output_snv', default = '',
help = 'path for the output file for SeattleSeq SNV, must have .ss_snv extention')
parser.add_argument('--output_indel', dest = 'output_indel', default = '',
help = 'path for the output file for SeattleSeq indels, must have .ss_indel extention')
parser.add_argument('--liftover', dest = 'liftover', default = '',
help = 'path for the output bed file that contains the coordinates\
generated by liftover program')
parser.add_argument('--outdir', dest = 'output_dir', default = '',
help = 'path for the output_dir')
parser.add_argument('--include_bed_file', dest = 'include_bed_file', default = 'None',
help = 'path for the include_bed_file')
parser.add_argument('--linkage_bed_file', dest = 'linkage_bed_file', default = 'None',
help = 'path for the linkage_bed_file')
parser.add_argument('--cnv_bed_file', dest = 'cnv_bed_file', default = 'None',
help = 'path for the cnv_bed_file')
parser.add_argument('--annotation_file', dest = 'annotation_file', default = 'None',
help = 'path for the intervals based annotations file')
parser.add_argument('--reference', dest = 'reference', default = '',
help = 'path for the reference genome fasta file')
parser.add_argument('--version', action = 'version', version = '0.1',
help = 'Prints version')
parser.add_argument('--project_code', dest = 'project_code', default = '', help = 'project code to be used in the analysis')
parser.add_argument('--allow_missing', action = 'store_true', help = 'allow missing samples from data')
parser.add_argument('--extract_subject_only', action = 'store_true',
help = 'extract only the subject (do not include data for other subjects in the output) when used with split_allfams_bysample')
parser.add_argument('--nextgene_data', action = 'store_true', help = 'data analyzed is generated by next gene: applies special criteria to accomodate NextGene variant reports')
parser.add_argument('--verbose', action = 'store_true', help = 'sets verbosity')
parser.add_argument('args', metavar = 'args', nargs = '*', help = 'all unnamed positional arguments')
args = parser.parse_args()
#print args
#print args.function_name
#time.sleep(10000)
return args
"""
codon1 = {TTT:"F", TTC:"F", TCT:"S", TCC:"S", TAT:"Y", TAC:"Y", TGT:"C", TGC:"C",
TTA:"L", TCA:"S", TAA:"*", TGA:"*", TTG:"L", TCG:"S", TAG:"*", TGG:"W", CTT:"L",
CTC:"L", CCT:"P", CCC:"P", CAT:"H", CAC:"H", CGT:"R", CGC:"R", CTA:"L", CTG:"L",
CCA:"P", CCG:"P", CAA:"Q", CAG:"Q", CGA:"R", CGG:"R", ATT:"I", ATC:"I", ACT:"T",
ACC:"T", AAT:"N", AAC:"N", AGT:"S", AGC:"S", ATA:"I", ACA:"T", AAA:"K", AGA:"R",
ATG:"M", ACG:"T", AAG:"K", AGG:"R", GTT:"V", GTC:"V", GCT:"A", GCC:"A", GAT:"D",
GAC:"D", GGT:"G", GGC:"G", GTA:"V", GTG:"V", GCA:"A", GCG:"A", GAA:"E", GAG:"E",
GGA:"G", GGG:"G"}
codon3 = {TTT:"Phe", TTC:"Phe", TCT:"Ser", TCC:"Ser", TAT:"Tyr", TAC:"Tyr",
TGT:"Cys", TGC:"Cys", TTA:"Leu", TCA:"Ser", TAA:"*", TGA:"*", TTG:"Leu",
TCG:"Ser", TAG:"*", TGG:"Trp", CTT:"Leu", CTC:"Leu", CCT:"Pro", CCC:"Pro",
CAT:"His", CAC:"His", CGT:"Arg", CGC:"Arg", CTA:"Leu", CTG:"Leu", CCA:"Pro",
CCG:"Pro", CAA:"Gln", CAG:"Gln", CGA:"Arg", CGG:"Arg", ATT:"Ile", ATC:"Ile",
ACT:"Thr", ACC:"Thr", AAT:"Asn", AAC:"Asn", AGT:"Ser", AGC:"Ser", ATA:"Ile",
ACA:"Thr", AAA:"Lys", AGA:"Arg", ATG:"Met", ACG:"Thr", AAG:"Lys", AGG:"Arg",
GTT:"Val", GTC:"Val", GCT:"Ala", GCC:"Ala", GAT:"Asp", GAC:"Asp", GGT:"Gly",
GGC:"Gly", GTA:"Val", GTG:"Val", GCA:"Ala", GCG:"Ala", GAA:"Glu", GAG:"Glu",
GGA:"Gly", GGG:"Gly"}
codonfull = {TTT:"Phenylalanine", TTC:"Phenylalanine", TCT:"Serine", TCC:"Serine",
TAT:"Tyrosine", TAC:"Tyrosine", TGT:"Cysteine", TGC:"Cysteine", TTA:"Leucine",
TCA:"Serine", TAA:"Stop", TGA:"Stop", TTG:"Leucine", TCG:"Serine", TAG:"Stop",
TGG:"Tryptophan", CTT:"Leucine", CTC:"Leucine", CCT:"Proline", CCC:"Proline",
CAT:"Histidine", CAC:"Histidine", CGT:"Arginine", CGC:"Arginine", CTA:"Leucine",
CTG:"Leucine", CCA:"Proline", CCG:"Proline", CAA:"Glutamine", CAG:"Glutamine",
CGA:"Arginine", CGG:"Arginine", ATT:"Isoleucine", ATC:"Isoleucine", ACT:"Threonine",
ACC:"Threonine", AAT:"Asparagine", AAC:"Asparagine", AGT:"Serine", AGC:"Serine",
ATA:"Isoleucine", ACA:"Threonine", AAA:"Lysine", AGA:"Arginine", ATG:"Methionine",
ACG:"Threonine", AAG:"Lysine", AGG:"Arginine", GTT:"Valine", GTC:"Valine",
GCT:"Alanine", GCC:"Alanine", GAT:"Aspartic acid", GAC:"Aspartic acid", GGT:"Glycine",
GGC:"Glycine", GTA:"Valine", GTG:"Valine", GCA:"Alanine", GCG:"Alanine",
GAA:"Glutamic acid", GAG:"Glutamic acid", GGA:"Glycine", GGG:"Glycine"}
codonr1 = {UUU:"F", UUC:"F", UCU:"S", UCC:"S", UAU:"Y", UAC:"Y", UGU:"C", UGC:"C",
UUA:"L", UCA:"S", UAA:"*", UGA:"*", UUG:"L", UCG:"S", UAG:"*", UGG:"W", CUU:"L",
CUC:"L", CCU:"P", CCC:"P", CAU:"H", CAC:"H", CGU:"R", CGC:"R", CUA:"L", CUG:"L",
CCA:"P", CCG:"P", CAA:"Q", CAG:"Q", CGA:"R", CGG:"R", AUU:"I", AUC:"I", ACU:"T",
ACC:"T", AAU:"N", AAC:"N", AGU:"S", AGC:"S", AUA:"I", ACA:"T", AAA:"K", AGA:"R",
AUG:"M", ACG:"T", AAG:"K", AGG:"R", GUU:"V", GUC:"V", GCU:"A", GCC:"A", GAU:"D",
GAC:"D", GGU:"G", GGC:"G", GUA:"V", GUG:"V", GCA:"A", GCG:"A", GAA:"E", GAG:"E",
GGA:"G", GGG:"G"}
codonr3 = {UUU:"Phe", UUC:"Phe", UCU:"Ser", UCC:"Ser", UAU:"Tyr", UAC:"Tyr",
UGU:"Cys", UGC:"Cys", UUA:"Leu", UCA:"Ser", UAA:"*", UGA:"*", UUG:"Leu",
UCG:"Ser", UAG:"*", UGG:"Trp", CUU:"Leu", CUC:"Leu", CCU:"Pro", CCC:"Pro",
CAU:"His", CAC:"His", CGU:"Arg", CGC:"Arg", CUA:"Leu", CUG:"Leu", CCA:"Pro",
CCG:"Pro", CAA:"Gln", CAG:"Gln", CGA:"Arg", CGG:"Arg", AUU:"Ile", AUC:"Ile",
ACU:"Thr", ACC:"Thr", AAU:"Asn", AAC:"Asn", AGU:"Ser", AGC:"Ser", AUA:"Ile",
ACA:"Thr", AAA:"Lys", AGA:"Arg", AUG:"Met", ACG:"Thr", AAG:"Lys", AGG:"Arg",
GUU:"Val", GUC:"Val", GCU:"Ala", GCC:"Ala", GAU:"Asp", GAC:"Asp", GGU:"Gly",
GGC:"Gly", GUA:"Val", GUG:"Val", GCA:"Ala", GCG:"Ala", GAA:"Glu", GAG:"Glu",
GGA:"Gly", GGG:"Gly"}
codonrfull = {UUU:"Phenylalanine", UUC:"Phenylalanine", UCU:"Serine", UCC:"Serine",
UAU:"Tyrosine", UAC:"Tyrosine", UGU:"Cysteine", UGC:"Cysteine", UUA:"Leucine",
UCA:"Serine", UAA:"Stop", UGA:"Stop", UUG:"Leucine", UCG:"Serine", UAG:"Stop",
UGG:"Tryptophan", CUU:"Leucine", CUC:"Leucine", CCU:"Proline", CCC:"Proline",
CAU:"Histidine", CAC:"Histidine", CGU:"Arginine", CGC:"Arginine", CUA:"Leucine",
CUG:"Leucine", CCA:"Proline", CCG:"Proline", CAA:"Glutamine", CAG:"Glutamine",
CGA:"Arginine", CGG:"Arginine", AUU:"Isoleucine", AUC:"Isoleucine", ACU:"Threonine",
ACC:"Threonine", AAU:"Asparagine", AAC:"Asparagine", AGU:"Serine", AGC:"Serine",
AUA:"Isoleucine", ACA:"Threonine", AAA:"Lysine", AGA:"Arginine", AUG:"Methionine",
ACG:"Threonine", AAG:"Lysine", AGG:"Arginine", GUU:"Valine", GUC:"Valine",
GCU:"Alanine", GCC:"Alanine", GAU:"Aspartic acid", GAC:"Aspartic acid", GGU:"Glycine",
GGC:"Glycine", GUA:"Valine", GUG:"Valine", GCA:"Alanine", GCG:"Alanine",
GAA:"Glutamic acid", GAG:"Glutamic acid", GGA:"Glycine", GGG:"Glycine"}
iupac = {R:'AG', Y:'CT', S:'GC', W:'AT', K:'GT', M:'AC', A:'AA', C:'CC', G:'GG', T:'TT',
B:'CGT', D:'AGT', H:'ACT', V:'ACG', N:'ACGT', '.':'-', '-':'-'}
threeletter = {"ala":["alanine","A"] ,"arg":["arginine","R"], "asn": ["asparagine","N"],
"asp":["aspartic acid","D"] ,
asparagine or aspartic acid asx B
cysteine cys C
glutamic acid glu E
glutamine gln Q
glutamine or glutamic acid
glx
Z
glycine
gly
G
histidine
his
H
isoleucine
ile
I
leucine
leu
L
lysine
lys
K
methionine
met
M
phenylalanine
phe
F
proline
pro
P
serine
ser
S
threonine
thr
T
tryptophan
trp
W
tyrosine
tyr
Y
valine
val
V
"""
class config(object):
debug=0
def __init__(self,configfile):
assert os.path.exists(configfile),"""ERROR:Configuration file not found, varmd.cfg
must exist in the same directory with the varmd.py"""
self.configfile = open(configfile)
self.inputfiles={}
self.input_bam_files={}
self.input_include_bed_files={}
self.input_cnv_bed_files={}
self.input_include_linkage_bed_files={}
for self.item in self.configfile:
if not self.item.strip():
continue
else:
self.tl=self.item.strip().split()
if self.tl[0][0] != '#':
if self.tl[0] == 'input_file':
if not self.inputfiles.get(self.tl[1]):
self.inputfiles[self.tl[1]] = {}
self.inputfiles[self.tl[1]] = self.tl[2]
else:
self.inputfiles[self.tl[1]] = self.tl[2]
elif self.tl[0]=='input_bam_file':
if not self.input_bam_files.get(self.tl[1]):
#self.input_bam_files[self.tl[1]]=[{self.tl[2]:self.tl[3]},]
self.input_bam_files[self.tl[1]]={}
self.input_bam_files[self.tl[1]][self.tl[2]]=self.tl[3]
else:
self.input_bam_files[self.tl[1]][self.tl[2]]=self.tl[3]
elif self.tl[0]=='input_include_bed_file':
if not self.input_include_bed_files.get(self.tl[1]):
self.input_include_bed_files[self.tl[1]]=self.tl[2]
else:
raise Exception('There are multiple entries for include bed files,\
please consolidate your coordinates')
elif self.tl[0]=='input_include_linkage_bed_file':
if not self.input_include_linkage_bed_files.get(self.tl[1]):
self.input_include_linkage_bed_files[self.tl[1]]=self.tl[2]
else:
raise Exception('There are multiple entries for include linkage\
bed files, please consolidate your coordinates')
elif self.tl[0]=='input_cnv_bed_file':
if not self.input_cnv_bed_files.get(self.tl[1]):
self.input_cnv_bed_files[self.tl[1]]=self.tl[2]
else:
raise Exception('There are multiple entries for include linkage\
bed files, please consolidate your coordinates')
elif self.tl[0]=='interval':
self.interval=self.tl[1]
elif self.tl[0]=='nodetype':
self.nodetype=self.tl[1]
elif self.tl[0]=='projects_path':
self.projects_path=self.tl[1]
elif self.tl[0]=='population_file':
self.population_file=self.tl[1]
elif self.tl[0]=='seattleseq_dir_path':
self.seattleseq_dir_path = self.tl[1]
elif self.tl[0]=='genesToExclude':
self.genes_to_exclude_path=self.tl[1]
elif self.tl[0]=='cdpred_cutoff':
self.cdpred_cutoff=self.tl[1]
elif self.tl[0]=='cdpred_cutoff':
self.cdpred_cutoff=self.tl[1]
elif self.tl[0]=='send_mail_to':
self.send_mail_to=self.tl[1]
elif self.tl[0]=='liftOver_path':
self.liftOver_path=self.tl[1]
elif self.tl[0]=='liftOver_chain_path':
self.liftOver_chain_path=self.tl[1]
elif self.tl[0]=='dbsnp_vcf_path':
self.dbsnp_vcf_path = self.tl[1]
elif self.tl[0]=='dbsnp_db_path':
self.dbsnp_db_path = self.tl[1]
elif self.tl[0]=='population_db_path':
self.population_db_path = self.tl[1]
elif self.tl[0]=='expression_snps_path':
self.expression_snps_path = self.tl[1]
elif self.tl[0]=='chrominfohg19_path':
self.chrominfohg19_path = self.tl[1]
elif self.tl[0]=='hg18fasta':
self.hg18fasta = self.tl[1]
elif self.tl[0]=='dbconnect':
self.dbconnect = self.tl[1]
else:
if self.debug==1:
print 'This configuration entry is not used',self.tl
class varfile(object):
def __init__(self, htl):
"""
input: file name
logic: make sure all file names are in the same format,
otherwise mistakes will occur with affected status function
.fam_members
.num_affected
.affected
"""
#-------------------------------------------------------------------------------
self.first_sample_index = 0
for x in htl:
if x[-3:]=='.NA':
self.first_sample_index = htl.index(x)
break
self.total_num_samples = (len(htl)-self.first_sample_index)/3
assert float(self.total_num_samples)%1 == 0.0
#-------------------------------------------------------------------------------
def get_project_members(self, project_code):
"""
reads projects_data.txt file generated using data from labmatrix and creates a dictionary of
project_members
"""
self.project_members={}
if args.projects_metadata_file != '':
projects_metadata_file = args.projects_metadata_file
else:
projects_metadata_file = mycfg.projects_path
prj_lines = open(projects_metadata_file).readlines()
prj_htl = prj_lines[0].strip().split('\t')
for line in prj_lines[1:]:
if not line.startswith('#'):
tl = line.strip().split('\t')
if tl[prj_htl.index('Fam_id')] == project_code:
self.project_members[tl[prj_htl.index('subject_code')]] = {}
self.project_members[tl[prj_htl.index('subject_code')]]['mom'] = tl[prj_htl.index('mom')]
self.project_members[tl[prj_htl.index('subject_code')]]['dad'] = tl[prj_htl.index('dad')]
self.project_members[tl[prj_htl.index('subject_code')]]['Fam_id'] = tl[prj_htl.index('Fam_id')]
self.project_members[tl[prj_htl.index('subject_code')]]['sex'] = tl[prj_htl.index('sex')]
self.project_members[tl[prj_htl.index('subject_code')]]['affected'] = tl[prj_htl.index('affected')]
self.project_members[tl[prj_htl.index('subject_code')]]['alive'] = tl[prj_htl.index('alive')]
self.project_members[tl[prj_htl.index('subject_code')]]['founder'] = tl[prj_htl.index('founder_genotype_in_pedigree')]
try:
self.project_members[tl[prj_htl.index('subject_code')]]['birth_year'] = tl[prj_htl.index('birth_year')]
except IndexError:
self.project_members[tl[prj_htl.index('subject_code')]]['birth_year'] = "NA"
try:
self.project_members[tl[prj_htl.index('subject_code')]]['date_sent_to_nisc'] = tl[prj_htl.index('Date_sent_to_NISC')]
except IndexError:
self.project_members[tl[prj_htl.index('subject_code')]]['date_sent_to_nisc'] = "NA"
if self.project_members == {}:
print 'NOTICE: Project_code = %s :No data was found in the projects_data file get_fam_members()'%project_code
if not project_code:
print "No project code was given"
return self.project_members
def get_num_affected(self, project_code):
self.num_affected=0
self.project_members = self.get_project_members(project_code)
for member,data in self.project_members.iteritems():
assert data['affected'] in ['1','0'], """The affected status is not 0\
or 1, the value found for affected status is >>" %s "<<, Please correct\
the value in %s"""%(data['affected'],
projects_metadata_path)
if data['affected'] == '1':
self.num_affected += 1
return self.num_affected
def downdb():
"""
downloads UCSC annotations files
"""
if args.downdb.lower()=='knowngene':
dbname = 'knownGene'
sys.stderr.write('using %s to specify use --buildver \n'%args.buildver)
command = 'wget ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/database/%s.txt.gz'%(args.buildver, dbname)
subprocess.call(shlex.split(command))
command = 'mv %s.txt.gz %s/'%(dbname, args.dbdir)
subprocess.call(shlex.split(command))
print 'file downloaded to %s'%args.dbdir
def sendmail():
enddate=time.ctime()
endall=time.time()
pathname, scriptname = os.path.split(sys.argv[0])
Msg='Script name:%s \nFunction name: %s\n\narguments:%s\n\n'\
'finished execution, please check output.\n'%(scriptname,
args.function_name,
args)
p = os.popen("/usr/sbin/sendmail -t", "w")
p.write("From: sincanm@mail.nih.gov\n")
p.write("To: %s\n"%mycfg.send_mail_to)
p.write("Subject: program execution status\n")
p.write("Start date:%s\nEnd date:%s\n"%(startdate,enddate))
p.write("Elapsed time is %.3s min\n"%((endall-startall)/60))
p.write(Msg)
p.close()
def usage():
print """This program takes function names and other parameters to run.
--func available function names are
usage
split_by_chrs,
prep_ss,
annotate_with,
annotate_with_allfams,
annotate_with_ss_snv,
annotate_with_ss_indel,
scandirs_cdpred,
mendelian_filters,
mendelian_filters_com_het_res,
annotate_with_mendelian_filters,
reduce_annotate_with,
create_allsamples_db,
update_allfams,
update_local_freq_file,
dbsnp_vcf2db,
prepare_liftover_input,
add_hg19,
reduce_addhg19,
run_analysis
run_analysis_single_file
)
--downdb [knowngene,]
--buildver (default: hg19)
Please contact sincanm@mail.nih.gov for instructions.
"""
def coords_19(interval):
"""Returns a list with each item containing
[chr,start,stop positions] based on the interval given"""
test=0
chrinfolist=[]
for line in open(mycfg.chrominfohg19_path):
tl=line.strip().split('\t')
if tl[0] in chrs:
chrinfolist.append(tl[:2])
if test:
print '*** chrinfolist',chrinfolist
if type(interval)==str:
interval=int(interval)
global coords19
coords19=[]
for item in chrinfolist:
coords19.append([item[0],0,interval])
while int(item[1])>coords19[-1][2]:
coords19.append([item[0],int(coords19[-1][2]+1),int(coords19[-1][2])+interval])
if test:
print '***coords19',coords19
return coords19
def write_lines(path1,linelist):
file=open(path1,"w")
file.writelines(linelist)
file.close()
def get_ptcode(headerline,p_pos):
"""
Reads the header line that is provided in the function call and return the
family codes based on the position provided in the initial variants for
positions
"""
test=0
global pt_code
pt_code=""
tl=headerline.strip().split("\t")
try:
if tl[p_pos][-3:]==".NA":
pt_code=tl[p_pos][:-3]
elif tl[p_pos][-3:]!=".NA":
pt_code=tl[p_pos]
except Exception:
print ('headerline in get genotypes()',headerline)
print ('tl without[p_pos] at get_ptcode',tl)
print ('p_pos in get_ptcode()',p_pos)
if test:
print "tl{p_pos] is %s"%tl[p_pos]
print "tl{p_pos][:-3] is %s"%tl[p_pos][:-3]
print "pt_code is %s"%pt_code
return pt_code
def p_ids():
"""
input:none, uses projects metadata file (argument or configuration file entry)
output: patient_ids(old) and family_data(new, udp code as key and
all the rest of family data file info as values
"""
test=0
#redundant global variable decleratiosn as the return values are assigned to variables now.
#cruft delete from code
#global patient_ids
#global family_data
patient_ids={}
family_data={}
try:
projects_lines = open(mycfg.projects_path).readlines()
htl = projects_lines[0].strip().split("\t")
#index Fam_id subject_code sex affected mom dad alive birth_year date_sent_to_NISC
for line in projects_lines[1:]:
if test:
print line
time.sleep(1)
if not line.startswith('#'):
tl=line.strip().split('\t')
patient_ids[tl[htl.index("subject_code")]]=tl[htl.index("index")]
family_data[tl[htl.index("subject_code")]]=tl[htl.index("sex"):]
except Exception:
print "%s could not be found"%mycfg.projects_path
sys.exit(1)
if test:
print 'patient ids:',patient_ids, 'len patient ids:',len(patient_ids)
print '\n\nfamily data:',family_data
time.sleep(1)
return patient_ids, family_data
def mpg_scores( tl, htl):
"""
input: a varsifter line, the file the line comes from
initial score is 1
each mpg/coverage for an individual is evaluated according to
the formula for a penalty of :(1/number of individuals in the family)
1-penalty is the final score
if the mpg score or the coverage is < 10 there is a penalty
if the coverage is between 10-20 we require the mpg to be at least half of the coverage
if the coverage is more than 20 than the mpg can be as low as 20% of the coverage
returns a float between 0 and 1.0 , the less the number
the worse the mpg_score evaluation is
"""
test= False
if args.nextgene_data: cutoff= 7
else:cutoff=10.0
if args.nextgene_data: validcutoff = 14.0
else: validcutoff=20.0
if args.nextgene_data: badmpgratiocutoff = 0.2
else: badmpgratiocutoff = 0.5
if args.nextgene_data: goodmpgratiocutoff = 0.1
else: goodmpgratiocutoff=0.15
final_mpg_score = Fraction(1,1)
myfileobj = varfile(htl)
pos = myfileobj.first_sample_index
num_pts=(len(tl)-(pos))/3
#print 'pos returned by myfileobj.first_sample_index is =>',pos
#print 'data at that position and -3:10 ',tl[pos-3:pos+10]
#print 'num_pts',num_pts
#time.sleep(100)
for mem in xrange(num_pts):
mpgpos = pos+((mem*3)+1)
coveragepos = pos+((mem*3)+2)
try:
float(tl[mpgpos].replace('NA','0'))
except ValueError, TypeError:
sys.stderr.write('mpgpos =%s, tl[mpgpos]=%s \n'%(mpgpos,tl[mpgpos]))
sys.stderr.write('mpg column doesn\'t contain numeric data ay lineindex %s\n'%tl[htl.index('Index')])
sys.stderr.write('Exiting program, please correct the data\n')
raise Exception
if (not tl[coveragepos].isdigit()) and tl[coveragepos] != "NA":
sys.stderr.write("coverage column doesn't contain numeric data\n")
sys.stderr.write("coverage pos = %s\n"%coveragepos)
sys.stderr.write("tl[coveragepos] = %s\n"%tl[coveragepos])
sys.exit()
mpg_score = float(tl[mpgpos].replace("NA","0"))
coverage = float(tl[coveragepos].replace("NA","0"))
if test:
print 'mpg_score',mpg_score
print 'coverage',coverage
if (mpg_score <= cutoff) or (coverage <= cutoff):
final_mpg_score -= Fraction(1, num_pts)
elif cutoff < mpg_score < validcutoff:
if (mpg_score /coverage) <= badmpgratiocutoff:
#print 'after else',tl[pos+1],tl[pos+2]
#print 'score is bad',line
#print '1',final_mpg_score
final_mpg_score -= Fraction(1, num_pts)
#print '2',final_mpg_score
elif ( mpg_score / coverage) > badmpgratiocutoff:
#print 'after 2.if',tl[pos+1],tl[pos+2]
pass
#print 'score is good',line
#time.sleep(1)
elif mpg_score >= validcutoff:
pass
#if (mpg_score /coverage) <= goodmpgratiocutoff:
#print 'after else',tl[pos+1],tl[pos+2]
#print 'score is bad',line
#print '1',final_mpg_score
#final_mpg_score -= Fraction(1, num_pts)
#print '2',final_mpg_score
#elif ( mpg_score / coverage) > goodmpgratiocutoff:
#print 'after 2.if',tl[pos+1],tl[pos+2]
#pass
#print 'score is good',line
#time.sleep(1)
else:
raise Exception
#print 'mpg score value calculated in the final_mpg_scores function is =>',final_mpg_score
#print 'sleeping 2 secs'
#time.sleep(2)
if not (0 <= float(final_mpg_score) <= 1.0):
print 'final_mpg_score is not within defined limits'
print 'final_mpg_score = %s'%float(final_mpg_score)
print 'number of total samples is %s'%num_pts
print 'line is', line
raise Exception
return float(final_mpg_score)
def get_genotypes(tl1,htl,myaffected, first_sample_index, family_data):
"""
This function gets a line and a headerline as input,
splits it by tab and determines the genotype of individuals
returns a dictionary with UDP codes as keys and pt raw genotypes and
pt calculated genotypes as values
gts [0]= patient genotype with two alleles
gts [1]= patient calculated genotype class (hom_ref,
hom_var,
het_complete,
var_incoplete,
ref_incomplete,
novel_mutation,
ref_novel_het,
var_novel_het,
NA )
gts [2]= mpg score
gts [3]= coverage
#the rest comes from family_data returned by p_ids()
gts [4]= sex
gts [5]= affected
gts [6]= mom udpcode
gts [7]= dad udpcode
gts[8]=alive[0,1]
gts[9]= founder_in_pedigree
gts[10]= birth year
gts[11]= date sent to NISC
"""
test=0
if test:
print 'tl1',tl1
print 'htl',htl
print 'myaffected',myaffected
print 'first_sample_index',first_sample_index
print 'this is tl [first_sample_index-5:first_sample_index+5]',tl1[int(first_sample_index)-5:int(first_sample_index)+5]
print 'tl[first_sample_index-1] ',tl1[first_sample_index-1]
print 'tl[first_sample_index] ',tl1[first_sample_index]
print 'tl[first_sample_index+1] ',tl1[first_sample_index+1]
#for x in htl:
# print htl.index(x), x, tl1[htl.index(x)]
#print 'family_data',family_data
gts={}
if len(tl1) != len(htl):
sys.stderr.write('tl != htl len(tl)= %s len(htl)=%s\n'%(len(tl1), len(htl)))
sys.stderr.write('tl= %s\n'%tl1)
sys.stderr.write('htl= %s\n'%htl)
raise Exception
pos = first_sample_index
if test:
print 'this is pos as a result of using myvarfile.first_sample_index with htl ',pos
print 'this is tl [pos-5:pos+5]',tl1[int(pos)-5:int(pos)+5]
print 'This is the item at tl[pos]',tl1[pos]
print '\nline in get_genotypoes() is ',tl1
print 'patient data in line in get_genotypes_ is ', tl1[pos-1:]
print 'number of patients determined by get_genotypes_() is ', len(myaffected)
time.sleep(1)
for key in myaffected.keys():
if test:
print 'key in myaffected before pop', key
if not '%s.NA'%key in htl:
print 'popping key %s'%key
print htl[40:]
if test:
'key %s popped from final list'%key
myaffected.pop(key)
if test:
print 'len myaffected', len(myaffected)
for k,v in myaffected.iteritems():
print 'k,v in myaffected',k,v
for sample_code,v in myaffected.iteritems():
if test:
print 'sample_code, v',sample_code, v
#print 'in get_genotypes_ function: pt_code is => %s'%sample_code
#time.sleep(1)
#print "data dictionary = 1:hom ref,2:hom var, 3:het complete,\
#4:ref incomplete,5:var incompelete, 6:single novel mutation,\
#7:ref novel het, 8:var novel het , 9:hom_novel"
gt_class=""
if len(tl1[htl.index('%s.NA'%sample_code)])<2:
if test:
print "found incomplete genotype",tl1
print tl1[htl.index('%s.NA'%sample_code)][allele]
#if tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["var_allele"]]:
if tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('var_allele')]:
gt_class="var_incomplete"
#elif tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["ref_allele"]]:
elif tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('ref_allele')]:
gt_class="ref_incomplete"
elif (tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("ref_allele")] and
tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("var_allele")]):
gt_class="novel_mutation"
else:
print "something wrong check the row",tl1
#time.sleep(1)
elif len(tl1[htl.index('%s.NA'%sample_code)])==2:
varcount=0
refcount=0
novelcount=0
if tl1[htl.index('%s.NA'%sample_code)]=="NA":
gt_class="NA"
else:
#print "this is a valid genotype =>",tl1[htl.index('%s.NA'%sample_code)]
#print tl1[htl.index('%s.NA'%sample_code)]
#print tl1[htl.index('var_allele']]
for allele in xrange(2):
#print tl1[htl.index('%s.NA'%sample_code)][allele]
if tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("var_allele")]:
varcount+=1
elif tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("ref_allele")]:
refcount+=1
elif (tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("ref_allele")] and
tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("var_allele")]):
novelcount+=1
#a=freq[tl1[htl.index('%s.NA'%sample_code)][allele]]
#print a[0],a[1]
#freql.append(a[1]+1)
#freq[tl1[htl.index('%s.NA'%sample_code)][allele]]=[gt_id[0][1],a[1]+1]
#print "varcount",varcount
#print "ref count",refcount
if varcount==2:
gt_class="hom_var"
elif refcount==2:
gt_class="hom_ref"
elif novelcount==2:
gt_class="hom_novel"
elif refcount==1 and varcount==1:
gt_class="het_complete"
elif refcount==1 and novelcount==1:
gt_class="ref_novel_het"
elif varcount==1 and novelcount==1:
gt_class="var_novel_het"
else:
print "something wrong check the row",tl1[a:a+3]
#print 'line is', line
#print 'calculated genotype class is',gt_class
#time.sleep(1)
elif len(tl1[htl.index('%s.NA'%sample_code)])>2:
if ":" in tl1[htl.index('%s.NA'%sample_code)]:
varcount=0
refcount=0
novelcount=0
#print 'found a double colon seperated one',tl1
for allele in xrange(2):
#print "allele = %s "%allele
allele_gt=""
if allele==0:
allele_gt=tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))]
elif allele==1:
allele_gt=tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:]
else:
print "allele gt could not be determined"
print "allele_gt is:",allele_gt
#print tl1[htl.index('%s.NA'%sample_code)][allele]
if allele_gt==tl1[htl.index("ref_allele")]:
refcount+=1
elif allele_gt==tl1[htl.index("var_allele")]:
varcount+=1
elif allele_gt!=tl1[htl.index("ref_allele")] and allele_gt!=tl1[htl.index("var_allele")]:
novelcount+=1
else:
print "something wrong with determining the genotype counts , check the row",tl1[a:a+3]
if varcount==2:
gt_class="hom_var"
elif refcount==2:
gt_class="hom_ref"
elif novelcount==2:
gt_class="hom_novel"
elif refcount==1 and varcount==1:
gt_class="het_complete"
elif refcount==1 and novelcount==1:
gt_class="ref_novel_het"
elif varcount==1 and novelcount==1:
gt_class="var_novel_het"
else:
print "ref gt=",tl1[htl.index("ref_allele")],len(tl1[htl.index("ref_allele")])
print "var gt=",tl1[htl.index("var_allele")],len(tl1[htl.index("var_allele")])
print "patient gt=",tl1[htl.index('%s.NA'%sample_code)],tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))],len(tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))]),tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:],len(tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:])
print "rc",refcount,"vc",varcount,"nc",novelcount
print "something wrong determining gt class variables, check the row",tl1[a:a+3]
#print ">2 freq1",freq1
#time.sleep(1)
else:
if tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("var_allele")]:
gt_class="hom_var"
elif tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("ref_allele")]:
gt_class="hom_ref"
elif tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("var_allele")]:
gt_class="het_complete"
else:
print "something wrong check the row",tl1[a:a+3],tl1[htl.index('%s.NA'%sample_code)]
else:
print 'row is not 2 or more characters long check row',tl1
#read family data table in to a dictionary
#patient_ids, family_data = p_ids()
#print 'pt_code in get genotypes() function is =>',sample_code
gts[sample_code] = [tl1[htl.index('%s.NA'%sample_code)],
gt_class,
tl1[htl.index('%s.NA'%sample_code)+1],
tl1[htl.index('%s.NA'%sample_code)+2]] + family_data[sample_code]
"""
print 'could not get value into gts dictionary, family data may be'\
'inconsistent with the variant report file make sure the patient'\
'codes match between the two files'
print 'Family data returned by p_ids() function as family_data =>',family_data
print '\npt_code in get genotypes() is %s \n'%sample_code
print 'tl1:',tl1
print 'gts dictionary is as follows'
for k,v in gts.items():
print k,'==>',v
raise Exception
#print 'counter is %s'%counter
#time.sleep(.5)
"""
#test code for the fucntion
if test:
print 'gts.iteritems()'
for k,v in gts.iteritems():
print k,v
if len(gts)!= len(myaffected) or gts == {}:
print "Error in get_genotypes()"
print "number of subjects is %s"%len(myaffected)
print 'myaffected=',myaffected
print "number of items in gts is %s"%len(gts)
print 'gts:',gts
print tl1
raise Exception
for k,v in gts.items():
#print 'genotype dict entry is k =%s v=>%s'%(k,v)
#time.sleep(1)
assert len(v)==12,'genotypes dictionary was not constructed correctly, please\
check the results k=%s, v=%s v is %s items long'%(k,v,len(v))
#time.sleep(1)
#print 'testcode gts=',gts
#print tl1
#time.sleep(30)
return gts
class varline(object):
"""
patient calculated zygosity state (hom_ref,
hom_var,
het_complete,
var_incoplete,
ref_incomplete,
novel_mutation,
ref_novel_het,
var_novel_het,
NA )
"""
ref_gt_count = 0
var_gt_count = 0
novel_gt_count = 0
ref_allele_count = 0
var_allele_count = 0
novel_allele_count = 0
def __init__(self,tl,htl,line=None):
self._gt_class = ''
self.pos = None
self.htl = htl
self.tl = tl
all_genotypes = []
self.all_genotypes_are_same = False
for x in htl:
if x.endswith('.NA'):
self.pos = htl.index(x)
break
self.mpg_coverage_score = mpg_scores(tl, htl)
assert isinstance(self.mpg_coverage_score, float)
for self.member in range(self.pos,len(self.tl),3):
#print 'member',self.member, 'len (line)',len(self.tl),'int (pos)',int(self.pos)
#get_ptcode(hline,pos)
#gt_index.append(pt_code)
###for self.value in self.tl[self._a:self._a+3]:
#self.gt_index=[]
self.gt_index = self.tl[self.member:self.member+3]
all_genotypes.append(self.gt_index[0])
if len(self.gt_index[0])<2:
#print "found incomplete genotype in ",self.tl
if self.gt_index[0]==self.tl[self.htl.index('ref_allele')]:
self._gt_class='ref_incomplete'
self.ref_allele_count+=1
elif self.gt_index[0]==self.tl[self.htl.index('var_allele')]:
self._gt_class='var_incomplete'
self.var_allele_count+=1
elif self.gt_index[0]!=self.tl[self.htl.index('var_allele')] and self.gt_index[0]!=self.tl[self.htl.index('var_allele')]:
self._gt_class='novel_incomplete'
self.novel_allele_count+=1
else:
print "something wrong check the row",il1[_a:_a+3]
elif len(self.gt_index[0])==2:
self.refcount=0
self.varcount=0
self.novelcount=0
if self.gt_index[0]=="NA":
#print "found NA",il1
pass
else:
for self.allele in xrange(2):
if self.gt_index[0][self.allele]==self.tl[self.htl.index('var_allele')]:
self.varcount+=1
elif self.gt_index[0][self.allele]==self.tl[self.htl.index('ref_allele')]:
self.refcount+=1
elif self.gt_index[0][self.allele]!=self.tl[self.htl.index('ref_allele')] and self.gt_index[0][self.allele]!=self.tl[self.htl.index('var_allele')]:
self.novelcount+=1
if self.varcount==2:
self._gt_class='hom_var'
self.var_gt_count+=1
self.var_allele_count+=2
elif self.refcount==2:
self._gt_class='hom_ref'
self.ref_gt_count+=1
self.ref_allele_count+=2
elif self.novelcount==2:
self._gt_class='hom_novel'
self.novel_gt_count+=1
self.novel_allele_count+=2
elif self.refcount==1 and self.varcount==1:
self._gt_class='het_complete'
self.ref_allele_count+=1
self.var_allele_count+=1
elif self.refcount==1 and self.novelcount==1:
self._gt_class='ref_novel_het'
self.ref_allele_count+=1
self.novel_allele_count+=1
elif self.varcount==1 and self.novelcount==1:
self._gt_class='var_novel_het'
self.var_allele_count+=1
self.novel_allele_count+=1
else:
print "something wrong check the row",il1[_a:_a+3]
#print 'self gt index',self.gt_index
#print 'self ref count',self.refcount
#print 'self var count',self.varcount
#print 'self novel count',self.novelcount
#print 'self gt class',self._gt_class
elif len(self.gt_index[0])>2:
if ":" in self.gt_index[0]:
self.varcount=0
self.refcount=0
self.novelcount=0
for self.allele in xrange(2):
#print "allele = %s "%allele
self.allele_gt = ""
if self.allele == 0:
self.allele_gt = self.gt_index[0][:(self.gt_index[0].index(":"))]
elif self.allele == 1:
self.allele_gt = self.gt_index[0][(self.gt_index[0].index(":"))+1:]
else:
print "allele gt could not be determined"
print "allele_gt is:",self.allele_gt
#test code
#print self.gt_index[0], self.allele_gt
#time.sleep(1)
if self.allele_gt == self.tl[self.htl.index('var_allele')]:
self.varcount+=1
elif self.allele_gt == self.tl[self.htl.index('ref_allele')]:
self.refcount+=1
elif self.allele_gt != self.tl[self.htl.index('ref_allele')] and self.allele_gt != self.tl[self.htl.index('var_allele')]:
self.novelcount+=1
else:
print "something wrong with determining the genotype counts , check the row",il1[_a:_a+3]
if self.varcount == 2:
self._gt_class = 'hom_var'
self.var_gt_count += 1
self.var_allele_count += 2
elif self.refcount == 2:
self._gt_class = 'hom_ref'
self.ref_gt_count += 1
self.ref_allele_count += 2
elif self.novelcount == 2:
self._gt_class='hom_novel'
self.novel_gt_count+=1
self.novel_allele_count+=2
elif self.refcount==1 and self.varcount==1:
self._gt_class='het_complete'
self.ref_allele_count+=1
self.var_allele_count+=1
elif self.refcount==1 and self.novelcount==1:
self._gt_class='ref_novel_het'
self.ref_allele_count+=1
self.novel_allele_count+=1
elif self.varcount==1 and self.novelcount==1:
self._gt_class='var_novel_het'
self.var_allele_count+=1
self.novel_allele_count+=1
else:
print "something wrong check the row",il1[_a:_a+3]
self.gt_index.append(self._gt_class)
if len(set(all_genotypes)) == 1:
self.all_genotypes_are_same = True
elif len(set(all_genotypes)) > 1:
self.all_genotypes_are_same = False
else:
raise Exception
def is_hom_rec(self,patient,line,varfile):
#work on this implement this
print 'line',line
print 'varfile',varfile
print patient
def scandirs_cdpred(inputfile, outputfile):
print "file: processing file: ",inputfile
myfile = open(inputfile)
hline = myfile.readline()
htl = hline.strip().split('\t')
muttype_present = False
if 'muttype' in hline:
muttype_present = True
cdneglines=[]
cdneglines.append(hline)
linecounter=0
snv_counter=0
indel_counter=0
for line in myfile:
linecounter+=1
tl=line.strip().split('\t')
try:
#print 'cdpred score',tl[myfilespecs['CDPred_score']]
#print 'cdpred score type',type(tl[myfilespecs['CDPred_score']])
#print 'cutoff',cutoff
#print 'cutoff type',type(cutoff)
#time.sleep(.2)
if 'polyPhen' in hline and ('CDPred_score' in hline):
if tl[htl.index('CDPred_score')] =='-':
cdneglines.append(line)
elif ( int(tl[htl.index('CDPred_score')] ) < int(mycfg.cdpred_cutoff) or
'damaging' in tl[htl.index('ss_polyPhen')]):
cdneglines.append(line)
else:
print 'CdPred is not a number or "-"'
raise Exception
elif 'polyPhen' in hline and (not 'CDPred_score' in hline):
#if 'damaging' in tl[htl.index('ss_polyPhen')]:
cdneglines.append(line)
else:
raise Exception
if muttype_present:
if tl[htl.index('muttype')]=="SNP":
snv_counter+=1
elif tl[htl.index('muttype')]=="INDEL":
indel_counter+=1
except Exception:
print """There is a problem with CDPred score or with PolyPhen score
either the scores are missing or the format is not correct. Please check the input files"""
print 'len cdneglines=',len(cdneglines)
write_lines(outputfile, cdneglines)
if linecounter != len(cdneglines):
print 'there were %s lines in the file, only %s of those were included in the cdneg file'%(linecounter,len(cdneglines))
tmpdir=os.path.dirname(inputfile)
fstats=open("%s/analysis_stats.txt"%tmpdir,"a")
fstats.write("Number of total variation lines in %s file is:\t%s"%(os.path.basename(inputfile),linecounter) +"\n" )
if muttype_present:
fstats.write("Number of SNVs in %s file is:\t%s"%(os.path.basename(inputfile),snv_counter) +"\n" )
fstats.write("Number of INDELs in %s file is:\t%s"%(os.path.basename(inputfile),indel_counter) +"\n" )
fstats.write("Total number of cd-pred negative lines are:\t%s\n"%(len(cdneglines)-1))
#end of stats
class db_class:
def create_tables(self):
vars_table = Table(
'final_variants',metadata,
Column('myindex', Integer, primary_key=True),
Column('project_code',String(50),nullable=False),
Column('mend_type',String(50),nullable=False),
Column('Chr',String(50),nullable=False),
Column('hg18LeftFlank',String(50),nullable=False),
Column('hg18RightFlank',String(50),nullable=False),
Column('hg19LeftFlank',String(50),nullable=False),
Column('hg19RightFlank',String(50),nullable=False),
Column('ref_allele',String(200),nullable=False),
Column('var_allele',String(200),nullable=False),
useexisting=True,
)
Index('idx_clr1', vars_table.c.mend_type,
vars_table.c.Chr,
vars_table.c.hg19LeftFlank,
vars_table.c.hg19RightFlank,
vars_table.c.var_allele)
compound_het_vars_table= Table(
'final_compound_het_variants',metadata,
Column('myindex',Integer,primary_key=True),
Column('project_code',String(50),nullable=False),
Column('mend_type',String(50),nullable=False),
Column('Chr1',String(50),nullable=False),
Column('hg18LeftFlank1',String(50),nullable=False),
Column('hg18RightFlank1',String(50),nullable=False),
Column('hg19LeftFlank1',String(50),nullable=False),
Column('hg19RightFlank1',String(50),nullable=False),
Column('ref_allele1',String(200),nullable=False),
Column('var_allele1',String(200),nullable=False),
Column('Chr2',String(50),nullable=False),
Column('hg18LeftFlank2',String(50),nullable=False),
Column('hg18RightFlank2',String(50),nullable=False),
Column('hg19LeftFlank2',String(50),nullable=False),
Column('hg19RightFlank2',String(50),nullable=False),
Column('ref_allele2',String(200),nullable=False),
Column('var_allele2',String(200),nullable=False),
useexisting=True,
)
Index('idx_clr2', compound_het_vars_table.c.mend_type,
compound_het_vars_table.c.Chr1,
compound_het_vars_table.c.hg18LeftFlank1,
compound_het_vars_table.c.hg18RightFlank1,
)
genotypes_table= Table(
'genotypes', metadata,
Column('myindex',None,ForeignKey('final_variants.myindex')),
Column('genotype',String(200),nullable=False),
Column('genotype_score',String(50),nullable=False),
Column('coverage',String(50),nullable=False),
useexisting=True,
)
metadata.create_all()
def check_and_add_db(self, htl=None, variantline1=None, variantline2=None,
project_code=None, mendtype=None):
"""
varfile is the file the variant line is extracted from ,used for
filespecs dictionary creation
variant line is the list of fileds generated by mendelian inheritance analysis
from the line in question
mendtype is the type of mendelian inheritence performed to create the
line,
project code is the deidentified code for sample studied
the format of the line is varsifter like
"""
# if this creates an error due to db tables not being created , call self.create_tables() in a try , except clause
vars_table = Table('final_variants', metadata, autoload=True)
compound_het_vars_table = Table('final_compound_het_variants',metadata, autoload=True)
genotypes_table = Table('genotypes', metadata, autoload=True)
# if this creates an error due to db tables not being created , call self.create_tables() in a try , except clause
test=0
verbose=1
if test:
print 'test code in db_class'
print 'variantline=',variantline
print 'project_code',project_code
print 'mendtype',mendtype
tl1=variantline1
tl2=variantline2
if mendtype in ('autozomal_rec','autozomal_dom','xlinked_rec'):
s = vars_table.select(and_ (vars_table.c.mend_type == mendtype,
vars_table.c.Chr == tl1[htl.index('Chr')],
vars_table.c.hg18LeftFlank == tl1[htl.index('LeftFlank')],
vars_table.c.hg18RightFlank == tl1[htl.index('RightFlank')],
vars_table.c.var_allele == tl1[htl.index('var_allele')]
))
test=0
if test:
print mendtype
print tl1[htl.index('Chr')]
print tl1[htl.index('LeftFlank')]
print tl1[htl.index('RightFlank')]
print tl1[htl.index('var_allele')]
raise Exception
rs = s.execute()
rows = rs.fetchall()
rs.close()
#print 'sqlalchemy:test: resultset rows is %s items'%len(rows)
foundsameflag=False
founddifferentflag=False
foundsamerow=None
founddifferentrow=[]
if rows:
for row in rows:
if not row[3].startswith('chr'):
print "row[3]",row[3]
print "row",row
raise Exception
if (row[1],row[2],row[3],row[4],row[5],row[9]) == (project_code,
mendtype,
tl1[htl.index('Chr')],
tl1[htl.index('LeftFlank')],
tl1[htl.index('RightFlank')],
tl1[htl.index('var_allele')],
):
foundsameflag=True
foundsamerow=row
elif (row[3],row[4],row[5],row[6],row[9]) == (
mendtype,
tl1[htl.index('Chr')],
tl1[htl.index('LeftFlank')],
tl1[htl.index('RightFlank')],
tl1[htl.index('var_allele')],
) and row[1]!= project_code:
founddifferentflag=True
founddifferentrow.append(row)
if foundsameflag == True:
pass
#print 'variant already in the database,skipping'
elif foundsameflag == False:
ins = vars_table.insert()
ins.execute(
myindex=None,
project_code = project_code,
mend_type = mendtype,
Chr = tl1[htl.index('Chr')].lower(),
hg18LeftFlank = tl1[htl.index('LeftFlank')],
hg18RightFlank = tl1[htl.index('RightFlank')],
hg19LeftFlank = tl1[htl.index('hg19LeftFlank')],
hg19RightFlank = tl1[htl.index('hg19RightFlank')],
ref_allele = tl1[htl.index('ref_allele')],
var_allele=tl1[htl.index('var_allele')],
)
#print 'not found in the database, added new record'
else:
raise Exception
return {'foundsameflag':foundsameflag,'founddifferentflag':founddifferentflag,
'foundsamerow':foundsamerow,'founddifferentrow':founddifferentrow}
elif mendtype == "compound_heterozygote":
if 'LeftFlank' in htl:
s = compound_het_vars_table.select(and_ ( compound_het_vars_table.c.project_code == project_code,
compound_het_vars_table.c.Chr1 == tl1[htl.index('Chr')],
compound_het_vars_table.c.hg18LeftFlank1 == tl1[htl.index('LeftFlank')],
compound_het_vars_table.c.hg18RightFlank1 == tl1[htl.index('RightFlank')],
))
rs = s.execute()
rows = rs.fetchall()
rs.close()
#print 'resultset of compound het rows is %s items'%len(rows)
foundsameflag=False
founddifferentflag=False
foundsamerow=None
founddifferentrow=[]
if rows:
for row in rows:
if ((row[1],row[2],
row[3],row[4],row[5],row[9],
row[10],row[11],row[12],row[16]) ==
(project_code, mendtype,
tl1[htl.index('Chr')],tl1[htl.index('LeftFlank')],tl1[htl.index('RightFlank')],tl1[htl.index('var_allele')],
tl2[htl.index('Chr')],tl2[htl.index('LeftFlank')],tl2[htl.index('RightFlank')],tl2[htl.index('var_allele')],
)):
foundsameflag = True
foundsamerow = row
elif ((row[2],
row[3],row[4],row[5],row[9],
row[10],row[11],row[12],row[16]) ==
(mendtype,
tl1[htl.index('Chr')],tl1[htl.index('LeftFlank')],tl1[htl.index('RightFlank')],tl1[htl.index('var_allele')],
tl2[htl.index('Chr')],tl2[htl.index('LeftFlank')],tl2[htl.index('RightFlank')],tl2[htl.index('var_allele')],
) and row[1] != project_code):
founddifferentflag = True
founddifferentrow.append(row)
if foundsameflag == True:
pass
#print 'variant already in the database,skipping'
elif foundsameflag == False:
ins = compound_het_vars_table.insert()
ins.execute(
myindex = None,
project_code = project_code,
mend_type = mendtype,
Chr1 = tl1[htl.index('Chr')].lower(),
hg18LeftFlank1 = tl1[htl.index('LeftFlank')],
hg18RightFlank1 = tl1[htl.index('RightFlank')],
hg19LeftFlank1 = tl1[htl.index('hg19LeftFlank')],
hg19RightFlank1 = tl1[htl.index('hg19RightFlank')],
ref_allele1=tl1[htl.index('ref_allele')],
var_allele1=tl1[htl.index('var_allele')],
Chr2=tl2[htl.index('Chr')],
hg18LeftFlank2=tl2[htl.index('LeftFlank')],
hg18RightFlank2=tl2[htl.index('RightFlank')],
hg19LeftFlank2=tl2[htl.index('hg19LeftFlank')],
hg19RightFlank2=tl2[htl.index('hg19RightFlank')],
ref_allele2=tl2[htl.index('ref_allele')],
var_allele2=tl2[htl.index('var_allele')],
)
#print 'not found in the database, added compound het pairs as new record'
else:
pass
else:
print "Mendelian type was not determined"
print "mendtype = %s"%mendtype
raise Exception
return {'foundsameflag':foundsameflag,'founddifferentflag':founddifferentflag,
'foundsamerow':foundsamerow,'founddifferentrow':founddifferentrow}
def penalty_score(tl,htl,cdnegfile,inheritance_model, project_code):
penaltyMAF=0.0
penaltyG5=0.0
penalty_linkage=0.0
penaltyHomozygousVarGT=0.0
penaltyVarAlleleFrequency=0.0
penaltyVarAlleleFrequency1=0.0
population_var_allele_count = 0.0
if 'Single_VarAllele_UDP_count' in htl:
if tl[htl.index('Single_VarAllele_UDP_count')] == '-':
pass
else:
population_var_allele_count += float(tl[htl.index('Single_VarAllele_UDP_count')])
if 'CS_572c_varallele' in htl:
if tl[htl.index('CS_572c_varallele')] == '-':
pass
else:
population_var_allele_count += float(tl[htl.index('CS_572c_varallele')])
myvarfile = varfile(htl)
myvarline = varline(tl,htl)
num_affected = myvarfile.get_num_affected(project_code)
total_num_samples = myvarfile.total_num_samples
if tl[htl.index('dbsnp137GMAF')] == '-' or tl[htl.index('dbsnp137GMAF')] == '':
pass
else:
penaltyMAF = float(tl[htl.index('dbsnp137GMAF')]) * 100
#-------------------------------------------------------------------------------
if tl[htl.index('dbsnp137G5')] == '+':
penaltyG5 = 50
#-------------------------------------------------------------------------------
if tl[htl.index('in_linkage_regions')] == '-':
penalty_linkage = 50
elif tl[htl.index('in_linkage_regions')] == 'NA' or tl[htl.index('in_linkage_regions')] == '+':
pass
else:
print """Error value in= tl[htl.index('in_linkage_regions')]""", tl[htl.index('in_linkage_regions')]
raise Exception
#penaltyHomozygousVarGT for local population data
if tl[htl.index('Homozygous_VarAllele_UDP_count')]=='-':
pass
elif int(tl[htl.index('Homozygous_VarAllele_UDP_count')])<=num_affected:
pass
elif int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) > num_affected:
if inheritance_model in ('autozomal-rec','x-linked-rec'):
penaltyHomozygousVarGT += (int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) - num_affected) * 1
elif inheritance_model in ('autozomal-dom','compound-heterozygous'):
penaltyHomozygousVarGT += (int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) - num_affected) * 10
else:
print 'inheritance_model is not recognized'
raise Exception
else:
print 'ERROR: Unexpected value=>hom gt value is',tl[htl.index('Homozygous_VarAllele_UDP_count')]
print 'number of affecteds in the family is',num_affected
raise Exception
#penaltyHomozygousVarGT for ClinSeq data
if 'CS_572c_homvar' in htl:
if tl[htl.index('CS_572c_homvar')] == '-':
pass
elif int(tl[htl.index('CS_572c_homvar')]) >= 0:
if inheritance_model in ('autozomal-rec','x-linked-rec'):
penaltyHomozygousVarGT += int(tl[htl.index('CS_572c_homvar')])
elif inheritance_model in ('autozomal-dom','compound-heterozygous'):
penaltyHomozygousVarGT += int(tl[htl.index('CS_572c_homvar')]) * 10
else:
print 'inheritance_model is not recognized'
raise Exception
else:
print 'ERROR: Unexpected value=>ClinSeq hom_var_gt value is',tl[htl.index('CS_572c_homvar')]
print 'number of affecteds in the family is',num_affected
raise Exception
else:
pass
#allele frequency penalties for local population data
if population_var_allele_count == 0.0:
pass
#if 'PRF1' in tl:
# print 'tl[htl.index(Single_VarAllele_UDP_count)]',tl[htl.index('Single_VarAllele_UDP_count')]
# print 'myvarline.var_allele_count',myvarline.var_allele_count
if tl[htl.index('Single_VarAllele_UDP_count')] == '-':
pass
elif int(tl[htl.index('Single_VarAllele_UDP_count')]) <= int(myvarline.var_allele_count):
pass
elif int(tl[htl.index('Single_VarAllele_UDP_count')]) > int(myvarline.var_allele_count) :
if inheritance_model in ('autozomal-rec','x-linked-rec'):
if penaltyHomozygousVarGT == 0.0:
penaltyVarAlleleFrequency += float(tl[htl.index('Single_VarAllele_UDP_count')])
else:
penaltyVarAlleleFrequency += float(tl[htl.index('Single_VarAllele_UDP_count')]) * 10
elif inheritance_model in ('autozomal-dom','compound-heterozygous'):
if int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) <= 0: #if there are no homozygous variants in the population
if int(tl[htl.index('Single_VarAllele_UDP_count')]) >= int(myvarline.var_allele_count):
penaltyVarAlleleFrequency += (float(tl[htl.index('Single_VarAllele_UDP_count')])- float(myvarline.var_allele_count)) #variant alleles in the population deduct penalty
elif int(myvarline.var_allele_count) > int(tl[htl.index('Single_VarAllele_UDP_count')]):
#if there are more variant alleles in the family/project than the population => variant allele is private to this family
pass
else: # if there are homozygous variant genotypes
if int(tl[htl.index('Single_VarAllele_UDP_count')]) >= int(myvarline.var_allele_count):
penaltyVarAlleleFrequency += ((float(tl[htl.index('Single_VarAllele_UDP_count')])- float(myvarline.var_allele_count))*10) #variant alleles in the population deduct penalty
elif int(myvarline.var_allele_count) > int(tl[htl.index('Single_VarAllele_UDP_count')]):
#if there are more variant alleles in the family/project than the population => variant allele is private to this family
pass
else:
print 'inheritance_model is not recognized model given is',inheritance_model
raise Exception
else:
print 'ERROR: Unexpected value=>hom gt value is',tl[htl.index('Single_VarAllele_UDP_count')]
print 'number of affecteds in the family is',num_affected
raise Exception
#allele frequency penalties for Clinseq frequencies
if 'CS_572c_homvar' in htl:
if tl[htl.index('CS_572c_varallele')] == '-' or tl[htl.index('CS_572c_varallele')] == '0':
pass
elif int(tl[htl.index('CS_572c_varallele')]) > 0:
if inheritance_model in ('autozomal-rec','x-linked-rec'):
if int(tl[htl.index('CS_572c_homvar')]) == 0: #if there are no homozygous variants in the population
penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')])
else:
penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')]) * 10
elif inheritance_model in ('autozomal-dom','compound-heterozygous'):
if int(tl[htl.index('CS_572c_homvar')]) == 0: #if there are no homozygous variants in the population
penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')])
else: # if there are homozygous variant genotypes
penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')]) * 10
else:
print 'inheritance_model is not recognized model given is',inheritance_model
raise Exception
else:
print 'ERROR: Unexpected value=> hom gt value is',tl[htl.index('CS_572c_varallele')]
print 'number of affecteds in the family is',num_affected
raise Exception
else:
pass
penaltyscore = - (penaltyMAF + penaltyG5 + penaltyHomozygousVarGT + penaltyVarAlleleFrequency + penaltyVarAlleleFrequency1 + penalty_linkage)
# if -5 > penaltyscore > -10:
# print 'penalty score',penaltyscore
# print 'single var_alellele_UDP count',tl[htl.index('Single_VarAllele_UDP_count')]
# print 'myvarline varallele count',int(myvarline.var_allele_count)
if penaltyscore > 0:
print 'Positive penalty score found please check function'
print 'penaltyMAF={0} penaltyG5={1} penaltyHomozygousVarGT={2} penaltyVarAlleleFrequency={3} penaltyVarAlleleFrequency1={4} penalty_linkage={5}'.format(penaltyMAF,
penaltyG5,
penaltyHomozygousVarGT,
penaltyVarAlleleFrequency,
penaltyVarAlleleFrequency1,
penalty_linkage)
print 'penalty score',penaltyscore
if 'CS_572c_homvar' in htl:
print 'CS_572c_varallele',tl[htl.index('CS_572c_varallele')]
print 'single var_alellele_UDP count',tl[htl.index('Single_VarAllele_UDP_count')]
print 'myvarline varallele count',int(myvarline.var_allele_count)
raise Exception
return (penaltyscore,
penaltyMAF,
penaltyG5,
penaltyHomozygousVarGT,
penaltyVarAlleleFrequency,
penalty_linkage)
def autosomal_recessive_filter(myaffected,mygts,tl1,htl,bed_coords,myfamcode):
"""
return 0 if not an autozomal recessive variant
return 1 if an autozomal recessive variant
"""
test = False
linepass = 1
hom_GTs=('A','G','C','T')
het_GTs=('R','Y','K','M','S','W')
inbed=0
for coord in bed_coords:
try:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
inbed=1
except Exception:
sys.stderr.write("""There was a problem with the bed coordinates or the file coordinates
I was not able to convert the coordinates from the bed file and the coordinates from the input variant file
into integers and evaluate the values for bed file annotation. The entries that caused the problem are:
""")
sys.stderr.write(str(coord))
sys.stderr.write('\n%s,%s,%s\n'%(tl1[htl.index('Chr')],tl1[htl.index('LeftFlank')],tl1[htl.index('RightFlank')]))
raise Exception
all_affecteds_are_hom_var=1
#all_unaffecteds_are_not_hom_var=1
hemizygous_line=0
mom_hemizygous=0
dad_hemizygous=0
hemizygous_sample=''
for k,v in mygts.iteritems():
if test:
print k,v
if not (
(int(myaffected[k]['affected'])==1 and v[1]=="hom_var") or
(int(myaffected[k]['affected'])==0 and v[1]!="hom_var")
):
linepass=0
if test: print 'linepass= %s'%linepass
if int(myaffected[k]['affected'])==1 and v[1]!='hom_var':
all_affecteds_are_hom_var=0
#if int(myaffected[k]['affected'])==0 and v[1]=='hom_var':
# all_unaffecteds_are_not_hom_var=0
if test:
print 'all_affecteds_are_homvar =%s'%all_affecteds_are_hom_var
#print 'all_affecteds_are_not_hom_var =%s'%all_unaffecteds_are_not_hom_var
"""
if linepass and all_affecteds_are_hom_var:
if test:
print 'linepassed and allaffecteds are hom var, checking parent hemizygosity'
for k,v in mygts.iteritems():
if myaffected[k].get('mom') not in (None,"NA") and myaffected[k].get('dad') not in (None, "NA"):
#print "k", k
#print 'myaffected[k]',myaffected[k]
#print 'myaffected[l].get(mom)', myaffected[k].get('mom')
#print 'myaffected[k].get(dad)',myaffected[k].get('dad')
if (int(myaffected[k]['affected'])==1 and
v[1]=='hom_var' and
( (int(myaffected.get(myaffected[k].get('mom')).get('affected')) == 0 and
mygts.get(myaffected[k].get('mom'),[0,0])[1] =='hom_ref') and
(int(myaffected.get(myaffected[k].get('dad')).get('affected')) == 0 and
mygts.get( myaffected[k].get('dad'),[0,0])[1] == 'het_complete')
)):
mom_hemizygous=1
if (int(myaffected[k]['affected'])==1 and v[1]=='hom_var' and
((int(myaffected.get(myaffected[k].get('dad')).get('affected')) == 0 and
mygts.get(myaffected[k].get('dad'),[0,0])[1] == 'hom_ref') and
(int(myaffected.get(myaffected[k].get('mom')).get('affected')) == 0 and
mygts.get( myaffected[k].get('mom'),[0,0])[1]=='het_complete')
)):
dad_hemizygous=1
if mom_hemizygous or dad_hemizygous:
hemizygous_line = 1
hemizygous_sample = k
if hemizygous_line and inbed:
pass
# fastafile = pysam.Fastafile(mycfg.hg18fasta)
# print '****************I got a real one*********************'
# print 'line being evaluated is', tl1
# print 'hemizygous sample is %s '%hemizygous_sample
# print 'mom_hemizygous=%s'%mom_hemizygous ,'dad_hemizygous=%s'%dad_hemizygous
# interval=10000
# steps=20
# total_called_bases_count=0.0
# hom_GTs_count=0.0
# het_GTs_count=0.0
# other_GTs_count=0.0 #non hom non het gts B,D,H,V,N
# called_bases=[]
# histograms={}
# for udpcode in (hemizygous_sample, myaffected[hemizygous_sample]['mom'],myaffected[hemizygous_sample]['dad']):
# print udpcode
# bamfile = mycfg.input_bam_files[myfamcode][udpcode]
# print udpcode
# print bamfile
# samfile = pysam.Samfile( bamfile, "rb" )
# fetched= samfile.fetch('chr16',73310465,73310466)
# for x in fetched:
# print dir(x)
# print 'x',str(x),'\n'
# time.sleep(1)
#
# puped= samfile.pileup('chr16',73310465,73310466)
# for y in puped:
# if y.pos==73310466:
# print 'pos is %s and coverage is %s, pileups= %s and tid=%s'%(y.pos,y.n,y.pileups, y.tid)
# time.sleep(1)
# pileup_iter = samfile.pileup (stepper = 'samtools' , fastafile=fastafile)
# snpcaller= pysam.SNPCaller(pileup_iter)
# sc= snpcaller.call(chrname,pos)
#
# histograms[udpcode]=[]
#
# chrname=tl1[htl.index('Chr')]
# pos=int( tl1[htl.index('LeftFlank')])
# leftpos = pos-(interval*steps)
# rightpos = leftpos + interval
# for step in range(steps*2):
# het_GTs_count1=0.0
# leftpos += interval
# rightpos += interval
# pileup_iter = samfile.pileup (chrname ,leftpos , rightpos, stepper = 'samtools' , fastafile=fastafile)
#
# snpcall_iter= pysam.IteratorSNPCalls(pileup_iter)
# for sc in snpcall_iter:
# total_called_bases_count+=1
# # print '\npos',sc.pos
# # print 'reference_base',sc.reference_base
# # print 'genotype',sc.genotype
# # print 'coverage',sc.coverage
# # print 'consensus_quality',sc.consensus_quality
# # print 'mapping quality',sc.mapping_quality
# # print 'snp_quality',sc.snp_quality
# # print 'tid',sc.tid
# if sc.genotype in hom_GTs:
# hom_GTs_count+=1
# elif sc.genotype in het_GTs:
# het_GTs_count+=1
# het_GTs_count1+=1
# else:
# other_GTs_count+=1
# called_bases.append(sc.genotype)
# #print 'for genomic region %s - %s '%(leftpos, rightpos)
# #print '%s => Homozygous genotypes count= %s'%(udpcode,hom_GTs_count)
# #print '%s => Heterozygous genotypes count= %s'%(udpcode,het_GTs_count)
# #print 'other genotypes count= %s'%other_GTs_count
# #print '%s => Total %s bases called'%(udpcode,total_called_bases_count)
# #print called_bases
# #print 'ratio of homozygote GT over tatal calls %s'%(hom_GTs_count/total_called_bases_count)
# histograms[udpcode].append(het_GTs_count1)
# for udpcode in (hemizygous_sample, myaffected[hemizygous_sample]['mom'],myaffected[hemizygous_sample]['dad']):
# print 'histogram for %s'%udpcode
# for count in histograms[udpcode]:
# print '*'*int(count)
else:
pass
#print 'I got a bogus one'
"""
#if linepass:
# print 'returned linepass is',linepass
# if all_affecteds_are_hom_var and all_unaffecteds_are_not_hom_var:
# print 'all affecteds are hom var and all unaffecteds are not hom var',tl1
if test:
print 'final linepass is %s \n'%linepass
return linepass
def somatic_mutation_filter(cdnegfile, project_code,**kwargs):
"""
mendelian_mendfilters hom res, dom, x-linked recessive detection function
family.txt file must have the patient codes as UDPxxx.NA or UDPxxx and the format must match
for patient code and affected scripts otherwise get affected script will error
"""
sys.stderr.write('Mendelian filtering started for %s at %s\n'%(cdnegfile, time.ctime()))
start=time.time()
test=0
mydb = db_class()
mydb.create_tables()
lenlines =0
myfile = open(cdnegfile)
hline = myfile.readline()
for x in myfile:
lenlines+=1
fmm = open("%s/%smendelian_mismatches.txt"%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile)),"a")
fmm.write(os.path.basename(cdnegfile)+'\n')
fmm.write(hline)
i=0.0
genes=open(mycfg.genes_to_exclude_path,"r").readlines()
mydb=db_class()
htl = hline.strip().split('\t')
htl.insert(htl.index('dbsnp137'),'fam_mpg:cov')
htl.insert(htl.index('dbsnp137'),'FIOFWSMIP') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_bed_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_linkage_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_cnv_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_score_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
bed_coords=[]
cnv_bed_coords=[]
linkage_bed_coords=[]
if kwargs['include_bed_file'] != 'None':
bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['include_bed_file']).readlines()]
if kwargs['linkage_bed_file'] != 'None':
linkage_bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['linkage_bed_file']).readlines()]
if kwargs['cnv_bed_file'] != 'None':
cnv_bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['cnv_bed_file']).readlines()]
#print kwargs
#print 'bed_ccords', len(bed_coords)
#print 'linkage_bed_coords', len(linkage_bed_coords)
#print 'starting calculations', len(cnv_bed_coords)
mendfilters=['autozomal-rec','autozomal-dom','x-linked-rec']
#mendfilters=['autozomal-dom']
mendfiltered_list=[]
nheader='\t'.join(htl)+'\n'
mendfiltered_list.append(nheader) #header line preperation start
finallines=[] #list to be sorted before writing to file
myvarfile = varfile(htl)
myvarfile.get_project_members(project_code)
myaffected = myvarfile.project_members
first_sample_index = myvarfile.first_sample_index
linecounter=0
myfile.seek(0)
myfile.readline()
#for mendfilter in mendfilters:
patient_ids, family_data = p_ids()
print ' \t'+'\t'.join(htl)
for line in myfile:
linecounter += 1
#if linecounter == 5000:
# break
tl1=line.strip().split("\t")
tl1.insert(htl.index('fam_mpg:cov'),'-')
tl1.insert(htl.index('FIOFWSMIP'),'-')
tl1.insert(htl.index('in_bed_regions'),'NA')
tl1.insert(htl.index('in_linkage_regions'),'NA')
tl1.insert(htl.index('in_cnv_regions'),'NA')
tl1.insert(htl.index('varmd_autozomal_rec'),'')
tl1.insert(htl.index('varmd_score_autozomal_rec'),'')
tl1.insert(htl.index('varmd_autozomal_dom'),'-')
tl1.insert(htl.index('varmd_score_autozomal_dom'),'')
tl1.insert(htl.index('varmd_xlinked_rec'),'-')
tl1.insert(htl.index('varmd_score_xlinked_rec'),'')
if 'CDPred_score' in line:
if tl1[htl.index('CDPred_score')] == '301':
tl1[htl.index('CDPred_score')] = '0.00001'
mygts = get_genotypes(tl1, htl , myaffected, first_sample_index, family_data)
linepass_xlinked_rec = 1
all_affecteds_are_hom_var = 1
all_unaffecteds_are_not_hom_var = 1
linepass_autosomal_rec = autosomal_recessive_filter(myaffected, mygts, tl1,htl,cnv_bed_coords, project_code)
assert linepass_autosomal_rec in [0,1],('linepass value is not expected')
tl1[htl.index('varmd_autozomal_rec')] = str(linepass_autosomal_rec)
print '%s\t'%linepass_autosomal_rec+'\t'.join(tl1)
#autozomal dominant filter
linepass_autosomal_dom = 1
if tl1[htl.index("Chr")] not in ["chrX","chrY","chrM"]:
for k, v in mygts.iteritems():
if test:
print '\n\n','k=',k ,'\n','v=',v
print 'v[4]',v[4]
print 'v[5]',v[5],'myaffected[k]',myaffected[k]
time.sleep(1)
if not(
(int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or
(int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ):
linepass_autosomal_dom = 0
elif tl1[htl.index("Chr")] == "chrX":
for k, v in mygts.iteritems():
#print "x linked dominant test?"
#print '\n\n','k',k ,'\n','v',v
#print 'family data[k]',family_data[k]
#print 'v[4]',v[4]
#print 'v[5]',v[5],'myaffected[k]',myaffected[k]
#time.sleep(1)
if not(
(int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or
(int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ):
linepass_autosomal_dom = 0
elif tl1[htl.index("Chr")] in ["chrY","chrM"]:
pass
else:
print tl1
print 'chr is not recognized: ',tl1[htl.index("Chr")]
raise Exception
if test:
print 'linepass autosomal dom=',linepass_autosomal_dom
tl1[htl.index('varmd_autozomal_dom')] = str(linepass_autosomal_dom)
#x-linked reccessive filter
for k, v in mygts.iteritems():
if (
tl1[htl.index('Chr')].lower()!='chrX'.lower()
or
(v[4]=='f' and not
(
#(int(myaffected[k][4])==1 and
(int(myaffected[k]['affected'])==1 and
(
v[1]=="var_incomplete" or v[1]=="var_novel_het"
))
or
#(int(myaffected[k][4])==0 and
(int(myaffected[k]['affected'])==0 and
(
v[1]=="hom_ref" or
v[1]=="het_complete" or
v[1]=="ref_incomplete" or
v[1]=="ref_novel_het"
))
))
or
#if a male is affected his genotype should have a single allele of variant
#if he is not affected his genotype should be either hom_ref of ref_incomplete
#with no variation at all present
#if these criteri are not met than the line fails to pass the filter
#if male and not ( affected and gt==var_incomplete) or (not affected and gt==either hom_ref or ref_incomplete)
#line doesn't pass filter
(v[4]=='m' and
#not( (int(myaffected[k][4])==1 and
not( (int(myaffected[k]['affected'])==1 and
(
#v[1]=="het_complete" or
v[1]=="var_incomplete"
)
) or
#(int(myaffected[k][4])==0 and
(int(myaffected[k]['affected'])==0 and
( v[1]=="hom_ref" or
v[1]=="ref_incomplete" )
)))
):
linepass_xlinked_rec = 0
tl1[htl.index('varmd_xlinked_rec')] = str(linepass_xlinked_rec)
#if all_affecteds_are_hom_var and all_unaffecteds_are_not_hom_var:
# print tl1
#print 'linepass',linepass
#if linepass==1:
# print 'line passed'
#elif linepass==0:
# print 'line did not pass'
genepass = 1
file_gene = ''
if 'refseq' in hline:
file_gene = tl1[htl.index('refseq')].lower()
elif 'ss_geneList' in hline and (not 'refseq' in hline):
file_gene = tl1[htl.index('ss_geneList')].lower()
for item in genes:
gene=item.strip()
delim=';'
if delim in item:
gene=item.split(delim)[0]
#print 'gene from line',tl1[htl.index('refseq')]
#print 'gene from list',gene
#time.sleep(1)
if file_gene == gene.lower():
#print 'found gene'
#print 'tl1 gene',tl1[htl.index('refseq')].lower()
#print gene.lower()
#time.sleep(1)
genepass=0
else:
#print 'not found gene'
#print 'tl1 gene',tl1[htl.index('refseq')].lower()
#print gene.lower()
pass
my_mpg_score = mpg_scores(tl1 , htl)
#HWE_wrong=check_HWE(line,cdnegfile)
#if 'PRF1' in line:
if test:
print tl1
print 'linepass_autosomal_rec',linepass_autosomal_rec
print 'linepass_autosomal_dom',linepass_autosomal_dom
print 'linepass_xlinked_rec',linepass_xlinked_rec
print 'genepass',genepass
print 'my_mpg_score %s \n'%str(my_mpg_score)
time.sleep(1)
if linepass_autosomal_rec == 0:
pass
#print '\nline that did not pass autozomal recessive filter\n',tl1
#time.sleep(1)
if ( (linepass_autosomal_rec or linepass_autosomal_dom or linepass_xlinked_rec) and
genepass and my_mpg_score>0.5 and
'random' not in tl1[htl.index('Chr')].lower() ):
# if line meets the Mendelian criteria and the gene is not an unimportant gene and mpg/coverage is good (>0.4) in at least half of the individuals
#print "hom res match found\n",tl1,"\n"
if kwargs['include_bed_file']!='None':
tl1[htl.index('in_bed_regions')]='-'
for coord in bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_bed_regions')]='+'
if kwargs['linkage_bed_file']!='None':
tl1[htl.index('in_linkage_regions')]='-'
for coord in linkage_bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_linkage_regions')]='+'
if kwargs['cnv_bed_file']!='None':
tl1[htl.index('in_cnv_regions')]='-'
for coord in cnv_bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_cnv_regions')]='+'
#it is important to keep the linkage_region annotation before the penalty score calculation
#as this information is used in the penalty score calculations
penaltyscore_autozomal_rec = penalty_score(tl1,htl,cdnegfile,"autozomal-rec", project_code)
penaltyscore_autozomal_dom = penalty_score(tl1,htl,cdnegfile,"autozomal-dom", project_code)
penaltyscore_xlinked_rec = penalty_score(tl1,htl,cdnegfile,"x-linked-rec", project_code)
#tl1[htl.index('varscore')]=str(float(tl1[htl.index('varscore')])-penaltyscore)
tl1[htl.index('varmd_score_autozomal_rec')] = str(penaltyscore_autozomal_rec[0])
tl1[htl.index('varmd_score_autozomal_dom')] = str(penaltyscore_autozomal_dom[0])
tl1[htl.index('varmd_score_xlinked_rec')] = str(penaltyscore_xlinked_rec[0])
tl1[htl.index('varscore')] = str(max( penaltyscore_autozomal_rec[0],
penaltyscore_autozomal_dom[0],
penaltyscore_xlinked_rec[0]) )
#debug remove
tl1[htl.index('fam_mpg:cov')] = str(my_mpg_score) #add mpg:coverage score to the file
finallines.append(tl1)
#ATTENTION
#database driven found in other families with same mendelian model penalty, too slow,
#the database function works fine but it is slow, try to improve speed and reenable it
if 'CDPred_score' in hline and tl1[htl.index('CDPred_score')]=='-':
tl1[htl.index('CDPred_score')] = '0.001' #not zero to differentiate from real zero values but almost zero in value
# time.sleep(5)
#elif resultdict['foundsameflag'] == True:
# print 'found same variant in same family; previously entered varint row passing'
# print resultdict['foundsamerow']
if linecounter % 50000 == 0:
sys.stderr.write('line %s of %s is processed\n'%(linecounter,lenlines))
#determines mismatches between my algorithm and nancies
if 'MendHomRec' in hline:
if int(tl1[htl.index('MendHomRec')]) == 1 and linepass_autosomal_rec == 0:
#print 'tl1',tl1
fmm.write('Autozomal recessive mismatch:\n')
fmm.write("\t".join(tl1)+'\n')
if 'MendDom' in hline:
if int(tl1[htl.index('MendDom')]) == 1 and linepass_autosomal_dom == 0:
#print 'tl1',tl1
fmm.write('Autozomal dominant mismatch:\n')
fmm.write("\t".join(tl1)+'\n')
if 'CDPred_score' in hline:
s_finallines = sorted(finallines, key = lambda line:(float(line[htl.index('varscore')]),abs(float(line[htl.index('CDPred_score')])) ), reverse=True)
else:
s_finallines = sorted(finallines, key=lambda line:(float(line[htl.index('varscore')])), reverse=True)
for tabline in s_finallines:
mendfiltered_list.append("\t".join(tabline)+"\n")
file_path = args.output_file
#close_variants(mendfiltered_list)
#raise Exception
write_lines(file_path, mendfiltered_list)
print 'Mendelian analysis output:{}'.format(file_path)
fstats=open("%s/analysis_stats.txt"%os.path.dirname(cdnegfile),"a")# write file stats
fstats.write("The number of variant lines that passed (autozomal_rec, autozomal_dom, xlinked_rec are:\t{} \n".format( len(mendfiltered_list)-1,
))
fstats.close()
end=time.time()
print "Elapsed time = {} min".format((end-start)/60 )
fmm.close()
def simple_mendelian_filters(cdnegfile, project_code,**kwargs):
"""
mendelian_mendfilters hom res, dom, x-linked recessive detection function
family.txt file must have the patient codes as UDPxxx.NA or UDPxxx and the format must match
for patient code and affected scripts otherwise get affected script will error
This function annotates the file with coordinates from upto three bed files. The coordinates in the bed files
must be hg19 based.
"""
sys.stderr.write('Mendelian filtering started for %s at %s\n'%(cdnegfile, time.ctime()))
start=time.time()
test=0
mydb = db_class()
mydb.create_tables()
lenlines =0
myfile = open(cdnegfile)
hline = myfile.readline()
for x in myfile:
lenlines+=1
fmm = open("%s/%smendelian_mismatches.txt"%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile)),"a")
fmm.write(os.path.basename(cdnegfile)+'\n')
fmm.write(hline)
i=0.0
genes=open(mycfg.genes_to_exclude_path,"r").readlines()
mydb=db_class()
htl = hline.strip().split('\t')
htl.insert(htl.index('dbsnp137'),'fam_mpg:cov')
htl.insert(htl.index('dbsnp137'),'FIOFWSMIP') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_bed_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_linkage_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_cnv_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'varmd_score_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern
bed_coords=[]
cnv_bed_coords=[]
linkage_bed_coords=[]
if kwargs['include_bed_file'] != 'None':
bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['include_bed_file']).readlines()]
if kwargs['linkage_bed_file'] != 'None':
linkage_bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['linkage_bed_file']).readlines()]
if kwargs['cnv_bed_file'] != 'None':
cnv_bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['cnv_bed_file']).readlines()]
#print kwargs
#print 'bed_ccords', len(bed_coords)
#print 'linkage_bed_coords', len(linkage_bed_coords)
#print 'starting calculations', len(cnv_bed_coords)
mendfilters=['autozomal-rec','autozomal-dom','x-linked-rec']
#mendfilters=['autozomal-dom']
mendfiltered_list=[]
nheader='\t'.join(htl)+'\n'
mendfiltered_list.append(nheader) #header line preperation start
finallines=[] #list to be sorted before writing to file
myvarfile = varfile(htl)
myvarfile.get_project_members(project_code)
myaffected = myvarfile.project_members
first_sample_index = myvarfile.first_sample_index
linecounter=0
myfile.seek(0)
myfile.readline()
#for mendfilter in mendfilters:
patient_ids, family_data = p_ids()
#print ' \t'+'\t'.join(htl)
gt_qual_scores ={}
for line in myfile:
linecounter += 1
#if linecounter == 5000:
# break
tl1=line.strip().split("\t")
tl1.insert(htl.index('fam_mpg:cov'),'-')
tl1.insert(htl.index('FIOFWSMIP'),'-')
tl1.insert(htl.index('in_bed_regions'),'NA')
tl1.insert(htl.index('in_linkage_regions'),'NA')
tl1.insert(htl.index('in_cnv_regions'),'NA')
tl1.insert(htl.index('varmd_autozomal_rec'),'')
tl1.insert(htl.index('varmd_score_autozomal_rec'),'')
tl1.insert(htl.index('varmd_autozomal_dom'),'-')
tl1.insert(htl.index('varmd_score_autozomal_dom'),'')
tl1.insert(htl.index('varmd_xlinked_rec'),'-')
tl1.insert(htl.index('varmd_score_xlinked_rec'),'')
if 'CDPred_score' in line:
if tl1[htl.index('CDPred_score')] == '301':
tl1[htl.index('CDPred_score')] = '0.00001'
mygts = get_genotypes(tl1, htl , myaffected, first_sample_index, family_data)
linepass_xlinked_rec = 1
all_affecteds_are_hom_var = 1
all_unaffecteds_are_not_hom_var = 1
linepass_autosomal_rec = autosomal_recessive_filter(myaffected, mygts, tl1,htl,cnv_bed_coords, project_code)
assert linepass_autosomal_rec in [0,1],('linepass value is not expected')
tl1[htl.index('varmd_autozomal_rec')] = str(linepass_autosomal_rec)
#autozomal dominant filter
linepass_autosomal_dom = 1
if tl1[htl.index("Chr")] not in ["chrX","chrY","chrM"]:
for k, v in mygts.iteritems():
if test:
print '\n\n','k=',k ,'\n','v=',v
print 'v[4]',v[4]
print 'v[5]',v[5],'myaffected[k]',myaffected[k]
time.sleep(1)
if not(
(int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or
(int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ):
linepass_autosomal_dom = 0
elif tl1[htl.index("Chr")] == "chrX":
for k, v in mygts.iteritems():
#print "x linked dominant test?"
#print '\n\n','k',k ,'\n','v',v
#print 'family data[k]',family_data[k]
#print 'v[4]',v[4]
#print 'v[5]',v[5],'myaffected[k]',myaffected[k]
#time.sleep(1)
if not(
(int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or
(int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ):
linepass_autosomal_dom = 0
elif tl1[htl.index("Chr")] in ["chrY","chrM"]:
pass
else:
print tl1
print 'chr is not recognized: ',tl1[htl.index("Chr")]
raise Exception
if test:
print 'linepass autosomal dom=',linepass_autosomal_dom
tl1[htl.index('varmd_autozomal_dom')] = str(linepass_autosomal_dom)
#x-linked reccessive filter
for k, v in mygts.iteritems():
if (
tl1[htl.index('Chr')].lower()!='chrX'.lower()
or
(v[4]=='f' and not
(
#(int(myaffected[k][4])==1 and
(int(myaffected[k]['affected'])==1 and
(
v[1]=="var_incomplete" or v[1]=="var_novel_het"
))
or
#(int(myaffected[k][4])==0 and
(int(myaffected[k]['affected'])==0 and
(
v[1]=="hom_ref" or
v[1]=="het_complete" or
v[1]=="ref_incomplete" or
v[1]=="ref_novel_het"
))
))
or
#if a male is affected his genotype should have a single allele of variant
#if he is not affected his genotype should be either hom_ref of ref_incomplete
#with no variation at all present
#if these criteri are not met than the line fails to pass the filter
#if male and not ( affected and gt==var_incomplete) or (not affected and gt==either hom_ref or ref_incomplete)
#line doesn't pass filter
(v[4]=='m' and
#not( (int(myaffected[k][4])==1 and
not( (int(myaffected[k]['affected'])==1 and
(
#v[1]=="het_complete" or
v[1]=="var_incomplete"
)
) or
#(int(myaffected[k][4])==0 and
(int(myaffected[k]['affected'])==0 and
( v[1]=="hom_ref" or
v[1]=="ref_incomplete" )
)))
):
linepass_xlinked_rec = 0
tl1[htl.index('varmd_xlinked_rec')] = str(linepass_xlinked_rec)
#if all_affecteds_are_hom_var and all_unaffecteds_are_not_hom_var:
# print tl1
#print 'linepass',linepass
#if linepass==1:
# print 'line passed'
#elif linepass==0:
# print 'line did not pass'
genepass = 1
file_gene = ''
if 'refseq' in hline:
file_gene = tl1[htl.index('refseq')].lower()
elif 'ss_geneList' in hline and (not 'refseq' in hline):
file_gene = tl1[htl.index('ss_geneList')].lower()
for item in genes:
gene=item.strip()
delim=';'
if delim in item:
gene=item.split(delim)[0]
#print 'gene from line',tl1[htl.index('refseq')]
#print 'gene from list',gene
#time.sleep(1)
if file_gene == gene.lower():
#print 'found gene'
#print 'tl1 gene',tl1[htl.index('refseq')].lower()
#print gene.lower()
#time.sleep(1)
genepass=0
else:
#print 'not found gene'
#print 'tl1 gene',tl1[htl.index('refseq')].lower()
#print gene.lower()
pass
my_mpg_score = mpg_scores(tl1 , htl)
if not gt_qual_scores.get(my_mpg_score):
gt_qual_scores[my_mpg_score] = 1
else:
gt_qual_scores[my_mpg_score] += 1
#HWE_wrong=check_HWE(line,cdnegfile)
#if 'PRF1' in line:
if test and (linepass_autosomal_rec or linepass_autosomal_dom or linepass_xlinked_rec):
print tl1
print 'linepass_autosomal_rec',linepass_autosomal_rec
print 'linepass_autosomal_dom',linepass_autosomal_dom
print 'linepass_xlinked_rec',linepass_xlinked_rec
print 'genepass',genepass
print 'my_mpg_score %s \n'%str(my_mpg_score)
time.sleep(1)
if ( (linepass_autosomal_rec or linepass_autosomal_dom or linepass_xlinked_rec) and
genepass and
my_mpg_score > 0.5 and
'random' not in tl1[htl.index('Chr')].lower() ):
# if line meets the Mendelian criteria and the gene is not an unimportant gene and mpg/coverage is good (>0.4) in at least half of the individuals
#print "hom res match found\n",tl1,"\n"
if kwargs['include_bed_file']!='None':
tl1[htl.index('in_bed_regions')]= '-'
for coord in bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('hg19LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('hg19RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_bed_regions')]='+'
if kwargs['linkage_bed_file']!='None':
tl1[htl.index('in_linkage_regions')]='-'
for coord in linkage_bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('hg19LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('hg19RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_linkage_regions')]='+'
if kwargs['cnv_bed_file']!='None':
tl1[htl.index('in_cnv_regions')]='-'
for coord in cnv_bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('hg19LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('hg19RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_cnv_regions')]='+'
#it is important to keep the linkage_region annotation before the penalty score calculation
#as this information is used in the penalty score calculations
penaltyscore_autozomal_rec = penalty_score(tl1,htl,cdnegfile,"autozomal-rec", project_code)
penaltyscore_autozomal_dom = penalty_score(tl1,htl,cdnegfile,"autozomal-dom", project_code)
penaltyscore_xlinked_rec = penalty_score(tl1,htl,cdnegfile,"x-linked-rec", project_code)
#tl1[htl.index('varscore')]=str(float(tl1[htl.index('varscore')])-penaltyscore)
tl1[htl.index('varmd_score_autozomal_rec')] = str(penaltyscore_autozomal_rec[0])
tl1[htl.index('varmd_score_autozomal_dom')] = str(penaltyscore_autozomal_dom[0])
tl1[htl.index('varmd_score_xlinked_rec')] = str(penaltyscore_xlinked_rec[0])
tl1[htl.index('varscore')] = str(max( penaltyscore_autozomal_rec[0],
penaltyscore_autozomal_dom[0],
penaltyscore_xlinked_rec[0]) )
#debug remove
tl1[htl.index('fam_mpg:cov')] = str(my_mpg_score) #add mpg:coverage score to the file
finallines.append(tl1)
#ATTENTION
#database driven found in other families with same mendelian model penalty, too slow,
#the database function works fine but it is slow, try to improve speed and reenable it
# fiof_penalty=0
# if linepass_autosomal_rec:
# resultdict = mydb.check_and_add_db(headerline = htl,
# variantline1 = tl1,
# project_code = project_code,
# mendtype = "autozomal_rec"
# )
# if linepass_autosomal_dom:
# resultdict = mydb.check_and_add_db(headerline = htl,
# variantline1 = tl1,
# project_code = project_code,
# mendtype = "autozomal_dom"
# )
# if linepass_xlinked_rec:
# resultdict = mydb.check_and_add_db(headerline = htl,
# variantline1 = tl1,
# project_code = project_code,
# mendtype = "xlinked_rec"
# )
#
# if resultdict['founddifferentflag'] == True:
# #print 'found same variant in different family==>',resultdict['founddifferentrow']
# #print 'line in this family is',tl1
# for item in resultdict['founddifferentrow']:
# tl1[htl.index('FIOFWSMIP')]+='%s,'%item[1]
# fiof_penalty -= 100
# tl1[htl.index('varscore')] = str((float(tl1[htl.index('varscore')])+ fiof_penalty))
if 'CDPred_score' in hline and tl1[htl.index('CDPred_score')]=='-':
tl1[htl.index('CDPred_score')] = '0.001' #not zero to differentiate from real zero values but almost zero in value
# time.sleep(5)
#elif resultdict['foundsameflag'] == True:
# print 'found same variant in same family; previously entered varint row passing'
# print resultdict['foundsamerow']
if linecounter % 50000 == 0:
sys.stderr.write('line %s of %s is processed\n'%(linecounter,lenlines))
#determines mismatches between my algorithm and nancies
if 'MendHomRec' in hline:
if int(tl1[htl.index('MendHomRec')]) == 1 and linepass_autosomal_rec == 0:
#print 'tl1',tl1
fmm.write('Autozomal recessive mismatch:\n')
fmm.write("\t".join(tl1)+'\n')
if 'MendDom' in hline:
if int(tl1[htl.index('MendDom')]) == 1 and linepass_autosomal_dom == 0:
#print 'tl1',tl1
fmm.write('Autozomal dominant mismatch:\n')
fmm.write("\t".join(tl1)+'\n')
for k,v in gt_qual_scores.iteritems():
print k,v
print 'len(finallines)',len(finallines)
print linecounter
if 'CDPred_score' in hline:
s_finallines = sorted(finallines, key = lambda line:(float(line[htl.index('varscore')]),abs(float(line[htl.index('CDPred_score')])) ), reverse=True)
else:
s_finallines = sorted(finallines, key=lambda line:(float(line[htl.index('varscore')])), reverse=True)
for tabline in s_finallines:
mendfiltered_list.append("\t".join(tabline)+"\n")
file_path = args.output_file
#close_variants(mendfiltered_list)
#raise Exception
write_lines(file_path, mendfiltered_list)
print 'Mendelian analysis output:{}'.format(file_path)
fstats=open("%s/analysis_stats.txt"%os.path.dirname(cdnegfile),"a")# write file stats
fstats.write("The number of variant lines that passed (autozomal_rec, autozomal_dom, xlinked_rec are:\t{} \n".format( len(mendfiltered_list)-1,
))
fstats.close()
end=time.time()
print "Elapsed time = {} min".format((end-start)/60 )
fmm.close()
def mendelian_filter_com_het_res(cdnegfile, outputfile, project_code,**kwargs):
"""
this function compares two lines and finds compund heterozygotes
if genotypes are similar but affected status don't match it should eliminate these false positive results such as AG AG AG AG
although G is variant since it is found in unaffecteds in geterozygote state it should be eliminated from the result set
"""
test=0
sys.stderr.write('Compound heterozygous mendelian filtering started for %s at %s\n'%(cdnegfile, time.ctime()))
lineslist = open(cdnegfile).readlines()
start=time.time()
i=0.0
mydb = db_class()
mydb.create_tables()
htl=lineslist[0].strip().split("\t")
myvarfile = varfile(htl)
com_het_res=[]
htl.insert(htl.index('dbsnp137'),'fam_mpg:cov')
htl.insert(htl.index('dbsnp137'),'FIOFWSMIP') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_bed_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_linkage_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
htl.insert(htl.index('dbsnp137'),'in_cnv_regions') #Found In Other Family With Same Mendelian Inheritance Pattern
nheader='\t'.join(htl)+'\n'
lineslist1=deepcopy(lineslist[1:])
lineslist2=deepcopy(lineslist[1:])
numlines = len(lineslist1)
myvarfile.get_project_members(project_code)
myaffected = myvarfile.project_members
#for k,v in myaffected.iteritems():
# print "key = %s , value=%s"%( k,v)
com_het_res.append(nheader) #add the headerline to the file as the first line
genes_toexclude = open(mycfg.genes_to_exclude_path,"r").readlines()
controllines={} #controllines dictionary has each gene as key and all variation lines for that gene as values
for line1 in lineslist1:
tl1=line1.strip().split("\t")
tl1.insert(htl.index('fam_mpg:cov'),'-')
tl1.insert(htl.index('FIOFWSMIP'),'-') #Found In Other Family With Same Mendelian Inheritance Pattern
tl1.insert(htl.index('in_bed_regions'),'NA')
tl1.insert(htl.index('in_linkage_regions'),'NA')
tl1.insert(htl.index('in_cnv_regions'),'NA')
file_gene=''
if 'refseq' in lineslist[0]:
file_gene = tl1[htl.index('refseq')].lower()
elif not ('refseq' in lineslist[0]) and ('Gene_name' in lineslist[0]):
file_gene = tl1[htl.index('Gene_name')].lower()
elif not ('refseq' in lineslist[0]) and not ('Gene_name' in lineslist[0]) and 'ss_geneList' in lineslist[0]:
file_gene = tl1[htl.index('ss_geneList')].lower()
else:
raise Exception
if not file_gene in controllines:
controllines[file_gene] = [line1]
elif file_gene in controllines:
controllines[file_gene].append(line1)
#print '\n\n controller:',controllines
#time.sleep(1)
bed_coords=[]
cnv_bed_coords=[]
linkage_bed_coords=[]
if kwargs['include_bed_file'] != 'None':
bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['include_bed_file']).readlines()]
if kwargs['linkage_bed_file'] != 'None':
linkage_bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['linkage_bed_file']).readlines()]
if kwargs['cnv_bed_file'] != 'None':
cnv_bed_coords=[[x.strip('\r\n').split('\t')[0],\
x.strip('\r\n').split('\t')[1],\
x.strip('\r\n').split('\t')[2]]\
for x in open(kwargs['cnv_bed_file']).readlines()]
linestosort=[]
first_sample_index = myvarfile.first_sample_index
patient_ids, family_data = p_ids()
if myvarfile.total_num_samples == 1:
sys.stderr.write('Only a single subjects variants are found, skipping compound het analysis\n')
sys.stderr.write('Please re-run compound het analysis is more members\' data are added to the file\n')
else:
for line1 in lineslist1:
i+=1
pt_d1={}
tl1=line1.strip().split("\t")
tl1.insert(htl.index('fam_mpg:cov'),'-')
tl1.insert(htl.index('FIOFWSMIP'),'-') #Found In Other Family With Same Mendelian Inheritance Pattern
tl1.insert(htl.index('in_bed_regions'),'NA')
tl1.insert(htl.index('in_linkage_regions'),'NA')
tl1.insert(htl.index('in_cnv_regions'),'NA')
file_gene = tl1[htl.index('Gene_name')].lower()
if file_gene in ['NA'.lower(),'none','-']: #this prevents 'NA' intergenic variants from pairing with arbitrary other
continue
gts1 = get_genotypes(tl1 , htl , myaffected, first_sample_index, family_data)
#pos 0 :pass1, pos1: pass 2, pos2: affected status, pos3: first
#line pt_ raw genotype, Pos4:line 1 pt calculated genotype,
#pos5:second line pt_raw genotype, pos6: second line pt calculated genotype
if test:
print '\ntesting gts dictionary output in com_het_res mendfilter'
print 'gts1',gts1
time.sleep(0.2)
#if genename==tl1[htl.index('refseq')]:
# print 'index',tl1[0]
# print 'in com het res function ','k=',k,'v=',v
for k,v in gts1.items():
#print k,v
pt_d1[k]=[int(myaffected[k]['affected']),v[0],v[1]] #v[0]= genotype v[1]= genotype class v[2]=mpg_score v[3]=coverage v[4]=gender
pt_d1[k].append('index=%s'%tl1[htl.index('Index')])
#print pt_d1[k]
#raise Exception
#if genename==tl1[htl.index('refseq')]:print '\n'
linepass1=1
for k,v in pt_d1.items():
if not (((v[0]==1 and
(v[2]=="het_complete" or
v[2]=="var_incomplete")))
or
( v[0]==0 and
(v[2]=="hom_ref" or
v[2]=="ref_incomplete" or
v[2]=="het_complete" ))):
linepass1 = 0
genepass=1
for item in genes_toexclude:
gene=item.strip()
delim=';'
if delim in item:
gene = item.split(delim)[0]
#print 'gene is',gene
if file_gene == gene.lower():
#if genename==tl1[htl.index('refseq')]:
# print "it's a bogus gene therefore it is eliminated from the results"
genepass=0
my_mpg_score = mpg_scores(tl1, htl)
tl1[htl.index('fam_mpg:cov')] = str(my_mpg_score) #add mpg:coverage score to the line
if test:
print 'mpg_score is',my_mpg_score
#if genename==tl1[htl.index('refseq')]:
# print 'mpg_score is',mpg_score
if ( linepass1 and genepass and my_mpg_score>0.5 and
('random' not in tl1[htl.index('Chr')].lower()) ):
# if line meets the Mendelian criteria and the gene is not an
#unimportant gene and mpg/coverage is good (>0.4) in at least half of the individuals and
for line2 in controllines[file_gene]:
pt_d2={}
tl2=line2.strip().split("\t")
tl2.insert(htl.index('fam_mpg:cov'),'-')
tl2.insert(htl.index('FIOFWSMIP'),'-') #Found In Other Family With Same Mendelian Inheritance Pattern
tl2.insert(htl.index('in_bed_regions'),'NA')
tl2.insert(htl.index('in_linkage_regions'),'NA')
tl2.insert(htl.index('in_cnv_regions'),'NA')
mpg_score2 = mpg_scores(tl2, htl) #returns a float mpg_score
tl2[htl.index('fam_mpg:cov')] = str(mpg_score2) #add mpg:coverage score to the line
gts2 = get_genotypes(tl2, htl, myaffected, first_sample_index, family_data)
if test:
print 'second gts',gts2
linepass2 = 1 #reset the pass status for second line
for k,v in gts2.items():
#pt_d2[k]=[int(myaffected[k][4]),v[0],v[1]]#v[0]= v[1]= v[2]=affected status v[4]=genotype v[5]=genotype class
pt_d2[k]=[int(myaffected[k]['affected']),v[0],v[1]]#v[0]= v[1]= v[2]=affected status v[4]=genotype v[5]=genotype class
pt_d2[k].append('index=%s'%tl2[htl.index('Index')])
if test:
print 'gts1',gts1
print 'gts2',gts2
print 'pt_d2[k]',pt_d2.items()
print 'index 1',tl1[htl.index('Index')]
print 'index 2',tl2[htl.index('Index')]
for k,v in pt_d2.items():
if not ((
(v[0]==1 and
(v[2]=="het_complete" or
v[2]=="var_incomplete"))
)
or
(v[0]==0 and
(v[2]=="hom_ref" or
v[2]=="ref_incomplete" or
v[2]=="het_complete" ))
): #these are to check if patients and unaffecteds genotypes are not matching exactly
linepass2=0
else:
pass
#pt_d1[k].append(pt_d2[k])
#if genename==tl1[htl.index('refseq')]:
# print 'index of second line based on gene name before pass',tl2[0]
if (tl1!=tl2 and mpg_score2>0.5 and linepass2):
#line meets the Mendelian criteria and the gene is not an unimportant gene and
#mpg/coverage is good (>0.4) in at least half of the individuals
linespass=1
#print 'all passed, before final check pt_d1=',pt_d1.items()
#print 'all passed, before final check pt_d2=',pt_d2.items()
#time.sleep(3)
#print 'initial linespassed'
#if genename==tl1[htl.index('refseq')]:
# print 'index of second line based on gene name after pass',tl2[0]
for k1,v1 in pt_d1.items():
for k2,v2 in pt_d2.items():
if k1!=k2 and v1[0]!=v2[0]:#for different pweople whose affected status are different as well
if v1[1]==v2[1]:# their genotypes match
linespass=0
elif (pt_d1[k1][1],pt_d2[k1][1]) == (pt_d1[k2][1],pt_d2[k2][1]):
linespass=0
"""
print '\nboth lines passed , not same lines, same gene, same transcript, good mpg scores'
print 'line1passed linepass1=%s genepass=%s mpg_score1=%s'%(linepass1,genepass,mpg_score)
print 'line2passed linepass2=%s mpg_score2=%s'%(linepass1,mpg_score2)
time.sleep(1)
"""
if linespass:
duplicateflag=0
#print '\nall passed, after final check, before write pt_d1=',pt_d1.items()
#print 'all passed, after final check, before write pt_d2=',pt_d2.items()
#time.sleep(1)
#if genename==tl1[htl.index('refseq')]:
# print "\t".join(tl1)+"\n"+"\t".join(tl2)+"\n\n"
# print tl1
# print tl2
# time.sleep(1)
if len(linestosort)>=1:
for x in linestosort:
#print 'x and linestosort index[x]',x,linestosort[x]
#print 'tl1=>',tl1
#print 'tl2=>',tl2
#print '\n'
#time.sleep(1)
if (x[2][htl.index('Index')] , x[3][htl.index('Index')]) == (tl2[htl.index('Index')],tl1[htl.index('Index')]):
#print '\n\nfound duplicate'
#print (linestosort[x][2][htl.index('Index')] , linestosort[x][3][htl.index('Index')]), (tl2[htl.index('Index')],tl1[htl.index('Index')])
#time.sleep(2)
duplicateflag=1
if not duplicateflag:
if kwargs['include_bed_file']!='None':
tl1[htl.index('in_bed_regions')]='-'
tl2[htl.index('in_bed_regions')]='-'
for coord in bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_bed_regions')]='+'
if (tl2[htl.index('Chr')]==coord[0] and
int(tl2[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl2[htl.index('RightFlank')]) <= int(coord[2]) ):
tl2[htl.index('in_bed_regions')]='+'
if kwargs['linkage_bed_file']!='None':
tl1[htl.index('in_linkage_regions')]='-'
tl2[htl.index('in_linkage_regions')]='-'
for coord in linkage_bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_linkage_regions')]='+'
if (tl2[htl.index('Chr')]==coord[0] and
int(tl2[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl2[htl.index('RightFlank')]) <= int(coord[2]) ):
tl2[htl.index('in_linkage_regions')]='+'
if kwargs['cnv_bed_file']!='None':
tl1[htl.index('in_cnv_regions')]='-'
tl2[htl.index('in_cnv_regions')]='-'
for coord in cnv_bed_coords:
if (tl1[htl.index('Chr')]==coord[0] and
int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ):
tl1[htl.index('in_cnv_regions')]='+'
if (tl2[htl.index('Chr')]==coord[0] and
int(tl2[htl.index('LeftFlank')]) >= int(coord[1]) and
int(tl2[htl.index('RightFlank')]) <= int(coord[2]) ):
tl2[htl.index('in_cnv_regions')]='+'
penaltyscore1 = penalty_score(tl1,htl,cdnegfile,'compound-heterozygous', project_code)
tl1[htl.index('varscore')] = str(penaltyscore1[0])
penaltyscore2 = penalty_score(tl2,htl,cdnegfile,'compound-heterozygous', project_code)
tl2[htl.index('varscore')]= str(penaltyscore2[0])
if 'CDPred_score' in htl:
if tl1[htl.index('CDPred_score')]=='-':
tl1[htl.index('CDPred_score')]= '301'
if tl2[htl.index('CDPred_score')]=='-':
tl2[htl.index('CDPred_score')]= '301'
linestosort.append(
[(float(tl1[htl.index('varscore')]) + float(tl2[htl.index('varscore')])),
min( int(tl1[htl.index('CDPred_score')]) , int(tl2[htl.index('CDPred_score')]) ),
tl1,
tl2])
elif not ('CDPred_score' in htl) and args.nextgene_data:
linestosort.append(
[(float(tl1[htl.index('varscore')]) + float(tl2[htl.index('varscore')])),
'301',
tl1,
tl2])
else:
raise Exception
linespass=0
if i % 1000 == 0:
sys.stderr.write("found a pair: progress in file %s is %.5s %%\n"%(cdnegfile , i/numlines *100))
sys.stderr.write("elapsed time = %.5s min\n"%((time.time() - start)/60))
#print 'linestosort',linestosort
#linespass=0 #reset linepass
# for kayit in linestosort:
# resultdict = mydb.check_and_add_db(htl=htl,
# variantline1 = kayit[2],
# variantline2 = kayit[3],
# project_code = project_code,
# mendtype = 'compound_heterozygote'
# )
# fiof_penalty=0
# if resultdict['founddifferentflag'] == True:
# #print 'found same variant pair in different family==>',resultdict['founddifferentrow']
# #print 'lines in this family are line1=',tl1
# #print 'lines in this family are line2=',tl2
# for result in resultdict['founddifferentrow']:
# kayit[2][htl.index('FIOFWSMIP')]+='%s,'%result[1]
# kayit[3][htl.index('FIOFWSMIP')]+='%s,'%result[1]
# fiof_penalty-=1000
# kayit[0]+=fiof_penalty
sorted_lines = sorted(linestosort, key=lambda item:float(item[0]), reverse=True)
for item in sorted_lines:
item[2][htl.index('varscore')] = str(item[0])
item[3][htl.index('varscore')] = str(item[0])
com_het_res.append("\t".join(item[2])+"\n")
com_het_res.append("\t".join(item[3])+"\n")
com_het_res_path='%s/%s.com_het_res'%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile))
write_lines(com_het_res_path,com_het_res)
print 'Mendelian compound het analysis output:{}'.format(com_het_res_path)
# write file stats
fstats=open("%s/analysis_stats.txt"%os.path.dirname(cdnegfile),"a")
fstats.write("The number of pairs for compound heterozygous variants are:\t%s "%((len(com_het_res)-1)/2) +"\n" )
end=time.time()
print "total function elapsed time=",end-start
#removes the temp directory created during execution
tmpdir='%s/%s_tmp'%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile))
if os.path.exists(tmpdir):
print 'don\'t forget to delete the directory by enabling the rmtree function'
#shutil.rmtree(tmpdir)
def annotate_seattle_seq_snv(inputfile, outputfile):
"""
SeattleSEQ ANNOTATIONS
--- inDBSNPOrNot (whether the variation is in the dbSNP database and-or
1000 Genomes; either echoing the user input in the case of a Maq file,
or with no user input, whether represented in these datasets)
--- chromosome (input from the user)
--- position (input from user, location on the chromosome, hg19, 1-based)
--- referenceBase (input from the user or calculated if not provided)
--- sampleGenotype (input from the user)
--- accession (NCBI or CCDS transcript identifier)
--- functionGVS (GVS class of variation function, using only hg19 and
your submitted alleles; see description)
--- functionDBSNP (dbSNP class of variation function)
--- rsID (dbSNP identifier for the variation)
--- aminoAcids (list of amino acids for the codon, starting with that of
the reference base)
--- proteinPosition (the position of the amino acid in the protein,
beginning at the N-terminal with the first amino acid at position 1,
followed by the total number of amino acids in the protein;
the total includes a count for the stop codon)
--- Alleles Submitted (column sampleAlleles: same information as sampleGenotype, with the ambiguity code resolved into two alleles; if there are multiple individuals, this is a list of all alleles present)
--- Genotype in dbSNP (column dbSNPGenotype: the genotype in dbSNP for a particular individual; N/N if not available,
X/X if conflicting measurements; this column is available only if a numerical dbSNP individual ID is entered on the query form)
--- Alleles in dbSNP (column allelesDBSNP: list of alleles for all populations and individuals in the GVS database; if the variation is in dbSNP, but has no genotypes for individuals, will be "NA")
--- Conservation Score phastCons (column scorePhastCons: UCSC, 46 placental mammalian species, range of 0 to 1, with 1 being the most conserved)
--- Conservation Score GERP (column consScoreGERP: rejected-substitution score from the program GERP, Stanford University, 34 mammalian species, range of -11.6 to 5.82, with 5.82 being the most conserved)
--- Chimp Allele (column chimpAllele: UCSC alignments)
--- Copy Number Variations (column CNV: Toronto database)
--- Genes (column geneList: HUGO names, any for which the transcription region overlaps the variation)
--- HapMap Frequencies (3 columns AfricanHapMapFreq, EuropeanHapMapFreq, AsianHapMapFreq: African, European, and Asian, in percent)
--- Has Genotypes (column hasGenotypes: whether GVS, and thus usually dbSNP, has genotypes available for the variation)
--- dbSNP Validation (column dbSNPValidation: dbSNP validation status codes, dealing with e.g. whether the variation has been seen at least twice)
--- Repeats (2 columns repeatMasker and tandemRepeat: NA if the variation is not in the GVS database)
--- Grantham Score (column granthamScore: the Grantham score of any amino acid changes, as per Grantham (1974) Science, Table 2)
--- microRNAs (column microRNAs: EMBL IDs of any microRNAs in which the variation is found)
--- Protein Sequence (column proteinSequence: sequence of amino acids for the overlapping transcript if applicable)
--- PolyPhen Prediction (column polyPhen: amino acid substitution impacts, more information below)
--- Clinical Association (column clinicalAssociation: links to NCBI pages, more information below)
--- Distance to Nearest Splice Site (column distanceToSplice: how close the variation is to a splice site, more information below)
--- cDNA Position (column cDNAPosition: the location within the sequence of coding bases; NA if not coding; so far only SNPs)
assert headerindel1==headerindel2, "The two files do not have the same headers,please use the same settings for seattleseq output"
"""
IUB_codes={'A':'A','G':'G','C':'C','T':'T',
'R':'AG','Y':'CT','K':'GT','M':'AC',
'S':'GC','W':'AT','N':'N'}
fw=open(outputfile, 'w')
import gzip
myfile = open(inputfile)
#print 'Input file is %s lines'%len(inlines)
sys.stderr.write('Seattle_seq annotation started for %s\n'%outputfile)
htl = myfile.readline().strip().split('\t')
ss_dict = {}
ss_file_flag=0
for file_ in glob.glob(os.path.join(mycfg.seattleseq_dir_path,'*')):
if 'SeattleSeqAnnotation' in file_ and 'ss_snv' in file_:
ss_file_flag+=1
ss_lines=gzip.open(file_,'rb').readlines()
shtl=ss_lines[0].strip().split('\t')
shtl.reverse()#so the columns appear in correct order in the final file
for item in shtl:
if not 'ss_%s'%item in htl:
htl.insert(0,'ss_{0}'.format(item))
shtl.reverse()#take the list back to it's normal order
fw.write('\t'.join(htl)+'\n')
for ss_line in ss_lines[1:]:
stl = ss_line.strip().split('\t')
if (not ss_line.startswith('#')):
mykey = (stl[shtl.index('chromosome')],stl[shtl.index('position')])
ss_dict[mykey] = {}
for item in shtl:
ss_dict[mykey][item] = stl[shtl.index(item)]
for line in myfile:
tl = line.strip().split('\t')
for i in range(len(htl)-len(tl)):
tl.insert(0,'-')
if tl[htl.index('hg19LeftFlank')] != '-':
mylookupkey = (tl[htl.index('Chr')].replace('chr',''),str(int(tl[htl.index('hg19LeftFlank')])+1))
#print mylookupkey
#raise Exception
if ss_dict.get(mylookupkey):
for item in shtl:
tl[htl.index('ss_{0}'.format(item))] = ss_dict.get(mylookupkey)[item]
if 'refseq' in htl:
if tl[htl.index('refseq')]=='NA' and ss_dict.get(mylookupkey)['geneList']!='none':
tl[htl.index('refseq')]= ss_dict.get(mylookupkey)['geneList']
fw.write('\t'.join(tl)+'\n')
fw.close()
if not ss_file_flag:
shutil.copyfile(inputfile, outputfile)
sys.stderr.write("""SeattleSeq annotation file was not found in the input directory
continuing without seattle seq annotations
""")
elif ss_file_flag > 1:
sys.stderr.write("""More than one SeattleSeq annotation file was found in the input directory
please check the files and make sure the annotation files are correct""")
def annotate_seattle_seq_indel(inputfile , outputfile):
IUB_codes={'A':'A','G':'G','C':'C','T':'T',
'R':'AG','Y':'CT','K':'GT','M':'AC',
'S':'GC','W':'AT','N':'N'}
fw=open(outputfile,'w')
import gzip
myfile = open(inputfile)
#print 'Input file is %s lines'%len(inlines)
htl = myfile.readline().strip().split('\t')
ss_file_flag=0
ss_dict = {}
for file_ in glob.glob(os.path.join(mycfg.seattleseq_dir_path,'*')):
if 'SeattleSeqAnnotation' in file_ and 'ss_indel' in file_:
ss_file_flag=1
ss_lines=gzip.open(file_,'rb').readlines()
#shtl=ss_lines[0].lstrip('# ').strip().split('\t')
shtl=ss_lines[0].strip().split('\t')
shtl.reverse()#so the columns appear in correct order in the final file
for item in shtl:
if not 'ss_%s'%item in htl:
htl.insert(0,'ss_{0}'.format(item))
shtl.reverse()#take the list back to it's normal order
fw.write('\t'.join(htl)+'\n')
for ss_line in ss_lines[1:]:
stl=ss_line.strip().split('\t')
if not ss_line.startswith('#'):
mykey = (stl[shtl.index('chromosome')],stl[shtl.index('position')])
ss_dict[mykey] = {}
for item in shtl:
ss_dict[mykey][item] = stl[shtl.index(item)]
for line in myfile:
tl = line.strip().split('\t')
for i in range(len(htl)-len(tl)):
tl.insert(0,'-')
if tl[htl.index('hg19LeftFlank')] != '-':
mylookupkey = (tl[htl.index('Chr')].replace('chr',''),tl[htl.index('hg19LeftFlank')])
if ss_dict.get(mylookupkey):
for item in shtl:
tl[htl.index('ss_{0}'.format(item))] = ss_dict.get(mylookupkey)[item]
if 'refseq' in htl:
if tl[htl.index('refseq')]=='NA' and ss_dict.get(mylookupkey)['geneList']!='none':
tl[htl.index('refseq')]= ss_dict.get(mylookupkey)['geneList']
fw.write('\t'.join(tl)+'\n')
fw.close()
if not ss_file_flag:
shutil.copyfile(inputfile ,outputfile)
sys.stderr.write("""SeattleSeq annotation file was not found in the input directory
continuing without seattle seq annotations
""")
def annotate_with_dbsnp(file_, output_file):
"""
works on myutils.coords19 BASED varsifter files that has hg19 annotations
adds dbsnp137 data to the file
"""
sys.stderr.write('NOTICE: varmd(annotate_with_dbsnp) started\n')
test=0
verbose=0
if verbose:
print 'file',file_
try:
expression_snps = [x.strip() for x in open(mycfg.expression_snps_path).readlines()]
except Exception:
raise Exception
headers_to_add =[
'Homozygous_VarAllele_UDP_count',
'Single_VarAllele_UDP_count',
'all_mpg:coverage_score',
'dbsnp137G5',
'dbsnp137ClinicalSignificance',
'dbsnp137GMAF',
'dbsnp137RS#',
'dbsnp137',
'predicted_expression_snp',
'varscore',
]
fwlog=open('%s_log'%output_file,'w')
myfile1 = open(file_)
htl = myfile1.readline().strip().split('\t')
file_already_has_dbsnp=True
for htl_ in headers_to_add:
if htl_ not in htl:
file_already_has_dbsnp = False
if file_already_has_dbsnp:
os.system('cp %s %s'%(file_, output_file))
fwlog.write('\n\nFile already has dbsnp data, please continue analysis\n')
elif not file_already_has_dbsnp:
myfile1 = open(file_)
if not 'hg19LeftFlank' in htl:
print 'hg19LeftFlank is not in htl'
print 'htl :', htl
raise Exception
fw = open(output_file,'w')
fwlog = open('%s_log'%output_file,'w')
writtenlinescounter=0
counter=0
#time.sleep(10)
htl = myfile1.readline().strip().split('\t')
for htl_ in headers_to_add:
htl.insert(0,htl_)
hline='\t'.join(htl)+'\n'
fw.write(hline)
linescounter=0
vcfadded=0
mykg_table = Table('kgdict', metadata, autoload=True)
chrindex = htl.index('Chr')
hg19LeftFlankIndex = htl.index('hg19LeftFlank')
for chr in chrs:
print chr
myselect = mykg_table.select().where(mykg_table.c.Chr == chr.replace('chr',''))
#rs = myselect.execute().fetchall()
rs = myselect.execute()
mydict = {}
for x in rs:
mydict[(x[1],str(x[2]-1))]= x[3:]
myfile1 = open(file_)
myfile1.readline()
chrlinecounter=0
for line in myfile1:
#if chrlinecounter > 20000:
# break
line = unicode(line, 'utf-8')
tl=line.strip().split('\t')
tl.insert(htl.index('varscore'),'0')
tl.insert(1,'-')
tl.insert(2,'-')
tl.insert(3,'-')
tl.insert(4,'-')
tl.insert(5,'-')
tl.insert(6,'-')
tl.insert(7,'-')
tl.insert(8,'-')
tl.insert(9,'-')
if test:
print tl
time.sleep(1)
vcffound=0
counter+=1
#verbose=True
if verbose and (counter%10)==0:
print 'progress is %f' %counter
#if tl[hg19LeftFlankIndex] in ['13301', '13326', '13979']:
# print "\ntest case"
# print 'tl',tl[:20]
# print 'mydict',mydict.get((tl[chrindex].replace('chr',''),tl[hg19LeftFlankIndex]))
if tl[chrindex]== chr:
if mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])):
vcffound+=1
gmafscore=''
scsclass=''
mysnpdata = mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex]))
kgvcftemp = mysnpdata[5].split(';')
snp_name = mysnpdata[0]
#print 'original dict entry', mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex]))
#print 'kgvcftem', kgvcftemp
#print 'snp_name', snp_name
#time.sleep(1)
for item in kgvcftemp:
if item.startswith('VP'):
kgvcftemp.remove(item)
if item=='G5':
tl[htl.index('dbsnp137G5')]='+' #">5% minor allele frequency in 1+ populations"
kgvcftemp.remove(item)
elif item.startswith('GMAF'):
gmafscore = item.split('=')[1]
kgvcftemp.remove(item)
elif item.startswith('SCS'):
scs = item.split('=')[1]
if scs=='0':
scsclass='unknown'
elif scs=='1':
scsclass='untested'
elif scs=='2':
scsclass='non-pathogenic'
elif scs=='3':
scsclass='probable-non-pathogenic'
elif scs=='4':
scsclass='probable-pathogenic'
elif scs=='5':
scsclass='pathogenic'
elif scs=='6':
scsclass='drug-response'
elif scs=='7':
scsclass='histocompatibility'
elif scs=='255':
scsclass='other'
kgvcftemp.remove(item)
#print 'final kgvcftemp',kgvcftemp
#time.sleep(10)
tl[htl.index('dbsnp137')]=','.join(kgvcftemp)
tl[htl.index('dbsnp137RS#')] = snp_name
if snp_name in expression_snps: #adds snps that are found in expression modifier regions
tl[htl.index('predicted_expression_snp')] = '+'
tl[htl.index('dbsnp137GMAF')] = gmafscore #"Global Minor Allele \
#Frequency [0, 0.5]; global population is 1000Genomes phase 1 genotype data from 629 individuals, released in the 08-04-2010 dataset"
tl[htl.index('dbsnp137ClinicalSignificance')] = scsclass
vcfadded+=1
try:
newline = '\t'.join(tl)+'\n'
newline = newline.encode('ascii', 'ignore')
fw.write(newline)
except UnicodeDecodeError as e:
print "there was an error with decoding the line content"
print tl
print e
raise Exception
if vcfadded:
fwlog.write('\n\nOut of a total of %s lines, %s lines were annotated with dbsnp137 annotation \n'%(linescounter,vcfadded))
fw.close()
endtime=time.time()
duration=((endtime-startall)/60)
fwlog.write('print "it took me %.2f minutes to complete the operation' %duration)
fwlog.close()
#send mail with primitive mail function that works in biowulf where smtp doesn't work.
#if int(duration)>180:
# sendmail()
def annotate_with_ESP(file_, output_file):
"""
works on myutils.coords19 BASED varsifter files that has hg19 annotations
adds ESP_variants from mysql database to the VAR-MD file
"""
sys.stderr.write('NOTICE: varmd(annotate_with_ESP_variants) started\n')
test=0
verbose=0
if verbose:
print 'file',file_
headers_to_add =[
'ESP_info',
'ESP_TAC',
'ESP_AA_AC',
'ESP_EA_AC',
'ESP_alt',
'ESP_ref',
]
fwlog=open('%s_log'%output_file,'w')
myfile1 = open(file_)
htl = myfile1.readline().strip().split('\t')
file_already_has_annotations = True
for htl_ in headers_to_add:
if htl_ not in htl:
file_already_has_annotations = False
if file_already_has_annotations:
os.system('cp %s %s'%(file_, output_file))
fwlog.write('\n\nFile already has ESP annotations , please continue analysis\n')
elif not file_already_has_annotations:
myfile1 = open(file_)
if not 'hg19LeftFlank' in htl:
print 'hg19LeftFlank is not in htl'
print 'htl :', htl
raise Exception
fw = open(output_file,'w')
fwlog = open('%s_log'%output_file,'w')
writtenlinescounter=0
counter=0
#time.sleep(10)
htl = myfile1.readline().strip().split('\t')
for htl_ in headers_to_add:
htl.insert(0,htl_)
hline='\t'.join(htl)+'\n'
fw.write(hline)
linescounter=0
vcfadded=0
my_table = Table('ESP_variants', metadata, autoload=True)
chrindex = htl.index('Chr')
hg19LeftFlankIndex = htl.index('hg19LeftFlank')
for chr in chrs:
print chr
myselect = my_table.select().where(my_table.c.Chr == chr.replace('chr',''))
#rs = myselect.execute().fetchall()
rs = myselect.execute()
mydict = {}
for x in rs:
mydict[(x[1],str(x[2]-1))]= x[3:]
myfile1 = open(file_)
myfile1.readline()
chrlinecounter=0
for line in myfile1:
#if chrlinecounter > 20000:
# break
line = unicode(line, 'utf-8')
tl=line.strip().split('\t')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
if test:
print tl
time.sleep(1)
vcffound=0
counter+=1
#verbose=True
if verbose and (counter%10)==0:
print 'progress is %f' %counter
#if tl[hg19LeftFlankIndex] in ['13301', '13326', '13979']:
# print "\ntest case"
# print 'tl',tl[:20]
# print 'mydict',mydict.get((tl[chrindex].replace('chr',''),tl[hg19LeftFlankIndex]))
if tl[chrindex]== chr:
if mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])):
vcffound+=1
gmafscore=''
scsclass=''
myresult = mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex]))
kgvcftemp = myresult[9].split(';')
#print 'original dict entry', mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex]))
#print 'kgvcftem', kgvcftemp
#print 'snp_name', snp_name
#time.sleep(1)
#print 'final kgvcftemp',kgvcftemp
#time.sleep(10)
tl[htl.index('ESP_ref')]=myresult[1]
tl[htl.index('ESP_alt')]=myresult[2]
tl[htl.index('ESP_EA_AC')]=myresult[6]
tl[htl.index('ESP_AA_AC')]=myresult[7]
tl[htl.index('ESP_TAC')]=myresult[8]
tl[htl.index('ESP_info')]= ';'.join(kgvcftemp)
vcfadded+=1
try:
newline = '\t'.join(tl)+'\n'
newline = newline.encode('ascii', 'ignore')
fw.write(newline)
except UnicodeDecodeError as e:
print "there was an error with decoding the line content"
print tl
print e
raise Exception
if vcfadded:
fwlog.write('\n\nOut of a total of %s lines, %s lines were annotated with dbsnp137 annotation \n'%(linescounter,vcfadded))
fw.close()
endtime=time.time()
duration=((endtime-startall)/60)
fwlog.write('print "it took me %.2f minutes to complete the operation' %duration)
fwlog.close()
#send mail with primitive mail function that works in biowulf where smtp doesn't work.
#if int(duration)>180:
# sendmail()
def annotate_with_allfams(myinfile,myoutfile):
"""INPUT: 1000 genomes/dbsnp annotated files
adds genotype frequency data from UDP cohort allsamples file
"""
sys.stderr.write('varmd (annotate_with_allfams) started for %s \n'%myinfile)
myfile1 = open(myinfile)
fw=open(myoutfile,'w')
fwlog=open('{0}_log'.format(myoutfile),'a')
try:
conn = sqlite3.connect(mycfg.population_db_path)
c=conn.cursor()
except Exception, e:
print """WARNING: There was a problem with opening the population database
Please refer to annotate_with_allfams """
print 'the exception is',e
hline=myfile1.readline()
htl= hline.rstrip().split('\t')
fw.write(hline)
fwlog.write('Positions where no corresponding data from population file could be found\n')
fwlog.write('Index\t\tChr\t\thg18LeftFlank\t\thg19LEftFlank\n')
linecounter=0
for line in myfile1:
linecounter+=1
found_unique_flag=0
found_multiple_flag=0
tl=line.strip().split('\t')
mykey=(tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19RightFlank')],
#tl[htl.index('muttype')],
tl[htl.index('var_allele')])
rc = c.execute('''
select * from popdb where chr=? and
hg19LeftFlank=? and
hg19RightFlank=? and
var_allele=?''',mykey)
rs = rc.fetchall()
if not rs:
try:
leftflankpos = tl[htl.index('LeftFlank')]
except:
leftflankpos = "None"
fwlog.write('%s\n'%'\t'.join([tl[htl.index('Index')],
tl[htl.index('Chr')],
leftflankpos,
tl[htl.index('hg19LeftFlank')]])
)
fw.write('\t'.join(tl)+'\n')
elif len(rs)==1:
#for x in rs[0]:
# print x, rs[0].index(x)
#raise Exception
#print rs[0][5].split(';;')
tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')}
tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4))
tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4))
tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4))
found_unique_flag=1
fw.write('\t'.join(tl)+'\n')
else:
print '%s records found'%len(rs)
print rs[0][:5]
print rs[1][:5]
tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')}
tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4))
tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4))
tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4))
found_multiple_flag=1
fw.write('\t'.join(tl)+'\n')
if linecounter%20000==0:
print linecounter
endtime=time.time()
duration=((endtime-startall)/60)
fwlog.write('print "it took me %.2f minutes to complete the operation' %duration)
fw.close()
fwlog.close()
def annotate_with_allfams_mysql(myinfile,myoutfile):
"""INPUT: dbsnp annotated file
adds genotype frequency data from UDP cohort allsamples database in mysql
"""
sys.stderr.write('varmd (annotate_with_allfams) started for %s \n'%myinfile)
myfile1 = open(myinfile)
fw=open(myoutfile,'w')
fwlog=open('{0}_log'.format(myoutfile),'a')
try:
conn = sqlite3.connect(mycfg.population_db_path)
c=conn.cursor()
except Exception, e:
print """WARNING: There was a problem with opening the population database
Please refer to annotate_with_allfams """
print 'the exception is',e
hline=myfile1.readline()
htl= hline.rstrip().split('\t')
fw.write(hline)
fwlog.write('Positions where no corresponding data from population file could be found\n')
fwlog.write('Index\t\tChr\t\thg18LeftFlank\t\thg19LEftFlank\n')
linecounter=0
for line in myfile1:
linecounter+=1
found_unique_flag=0
found_multiple_flag=0
tl=line.strip().split('\t')
mykey=(tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19RightFlank')],
#tl[htl.index('muttype')],
tl[htl.index('var_allele')])
rc = c.execute('''
select * from popdb where chr=? and
hg19LeftFlank=? and
hg19RightFlank=? and
var_allele=?''',mykey)
rs = rc.fetchall()
if not rs:
try:
leftflankpos = tl[htl.index('LeftFlank')]
except:
leftflankpos = "None"
fwlog.write('%s\n'%'\t'.join([tl[htl.index('Index')],
tl[htl.index('Chr')],
leftflankpos,
tl[htl.index('hg19LeftFlank')]])
)
fw.write('\t'.join(tl)+'\n')
elif len(rs)==1:
#for x in rs[0]:
# print x, rs[0].index(x)
#raise Exception
#print rs[0][5].split(';;')
tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')}
tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4))
tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4))
tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4))
found_unique_flag=1
fw.write('\t'.join(tl)+'\n')
else:
print '%s records found'%len(rs)
print rs[0][:5]
print rs[1][:5]
tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')}
tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4))
tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4))
tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4))
found_multiple_flag=1
fw.write('\t'.join(tl)+'\n')
if linecounter%20000==0:
print linecounter
endtime=time.time()
duration=((endtime-startall)/60)
fwlog.write('print "it took me %.2f minutes to complete the operation' %duration)
fw.close()
fwlog.close()
def create_allsamples_db():
"""
Generates a sqlite3 database using the data in the Allsamples file
the path to input data comes from config.population_file
The path to database file comes from configuration.population_db_path
"""
myfile = varfile(open(mycfg.population_file).readline().strip().split('\t') )
patient_pos=myfile.first_sample_index
fr = open(mycfg.population_file)
headerline = fr.readline()
htl = headerline.strip().split('\t')
htl.insert(0,'var_allele_count')
htl.insert(0,'ref_allele_count')
htl.insert(0,'var_gt_count')
htl.insert(0,'ref_gt_count')
htl.insert(0,'all_mpg:coverage_score')
qsubscratchpath='/scratch/varmd/'
scratchflag = 0
if os.path.exists(qsubscratchpath):
mydatabasepath = os.path.join(qsubscratchpath ,'population_sqlite3.db')
scratchflag = 1
else:
mydatabasepath = mycfg.population_db_path
conn = sqlite3.connect(mydatabasepath)
print 'database is opened at %s'%mydatabasepath
c=conn.cursor()
c.execute('''
create table IF NOT EXISTS popdb (chr INTEGER,
hg19LeftFlank INTEGER,
hg19RightFlank INTEGER,
muttype,
var_allele,
data,
var_allele_count,
ref_allele_count,
var_gt_count,
ref_gt_count,
all_mpg_coverage_score
)''')
linecounter=0
duplicates=0
fr = open(mycfg.population_file)
fr.readline()
for line in fr:
line = unicode(line, 'utf-8')
linecounter+=1
#print line;time.sleep(0.5)
tl=line.strip().split('\t')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
newline = varline(tl,htl)
if not (tl[ htl.index('hg19LeftFlank')].isdigit() and
tl[ htl.index('hg19RightFlank')].isdigit()):
pass
else:
tl[htl.index('var_allele_count')]=str(newline.var_allele_count)
tl[htl.index('ref_allele_count')]=str(newline.ref_allele_count)
tl[htl.index('var_gt_count')]=str(newline.var_gt_count)
tl[htl.index('ref_gt_count')]=str(newline.ref_gt_count)
tl[htl.index('all_mpg:coverage_score')]=str(newline.mpg_coverage_score)
#DO NOT USE REFSEQ as part of the key as it creates errors
#when the same position is annotated with different gene names
# there are a few indel based multiple variants with this key combination
#this is currently not handled by var-md
#work on this later on
myentry = ['%s::%s'%(htl[i],tl[i]) for i in range(len(tl))]
#for each item in the line add column header as key and item as value
c.execute('''insert into popdb values(?,?,?,?,?,?,?,?,?,?,?)''',(
tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19RightFlank')],
tl[htl.index('muttype')],
tl[htl.index('var_allele')],
';;'.join(myentry),
str(newline.var_allele_count),
str(newline.ref_allele_count),
str(newline.var_gt_count),
str(newline.ref_gt_count),
str(newline.mpg_coverage_score),
))
if linecounter%10000 == 0:
conn.commit() #it is very important to commit regularly, if you don't commit
#every so often the roll back journal gets too big and halts the script and brings it to its knees
if linecounter%30000 == 0:
print linecounter
#result=c.execute('select * from kgdict limit 10''')
#print result.fetchone()
#time.sleep(2)
print 'database created, now creating index'
c.execute('CREATE INDEX idx_popdb ON popdb(chr,hg19LeftFlank,hg19RightFlank)')
conn.commit()
c.close()
print ('database created')
print 'finished processing %s'%mycfg.population_file
print 'Total %s lines had duplicates'%duplicates
print 'new files can be found under %s directory'%os.path.dirname(mycfg.population_file)
if scratchflag ==1:
os.system('cp %s %s'%(mydatabasepath, mycfg.population_db_path))
os.system('rm %s'%(mydatabasepath))
sendmail()
def create_allsamples_db_mysql():
"""
Generates a database using the data in the Allsamples file
the path to input data comes from config.population_file
The path to database file comes from configuration.population_db_path
"""
#print 'please edit the table name for a new input file'
#raise Exception
popdb_table = Table(
'popdb_table_06_02',metadata,
Column('myindex',Integer,primary_key=True),
Column('NISC_Index',Integer,nullable=False),
Column('Chr',String(10),nullable=False),
Column('LeftFlank',Integer,nullable=False),
Column('RightFlank',Integer,nullable=False),
Column('hg19LeftFlank',Integer,nullable=False),
Column('hg19RightFlank',Integer,nullable=False),
Column('ref_allele',String(300),nullable=False),
Column('var_allele',String(300),nullable=False),
Column('muttype',String(30),nullable=False),
Column('variant_type',String(40),nullable=False),
Column('var_allele_count',Integer,nullable=False),
Column('ref_allele_count',Integer,nullable=False),
Column('var_gt_count',Integer,nullable=False),
Column('ref_gt_count',Integer,nullable=False),
Column('all_mpg_coverage_score',Integer,nullable=False),
Column('Gene_name',String(500),nullable=False),
Column('loc_type',String(100),nullable=False),
Column('consequence',String(1000),nullable=False),
Column('All_Gahlc_genotypes_NISC',Integer,nullable=False),
Column('All_Gahlc_homref_NISC',Integer,nullable=False),
Column('All_Gahlc_het_NISC',Integer,nullable=False),
Column('All_Gahlc_homvar_NISC',Integer,nullable=False),
Column('All_Gahlc_hetnonref_NISC',Integer,nullable=False),
Column('All_Gahlc_other_NISC',Integer,nullable=False),
Column('All_Gahlc_refallele_NISC',Integer,nullable=False),
Column('All_Gahlc_varallele_NISC',Integer,nullable=False),
Column('All_Gahlfreq_homref_NISC',Float(precision=4),nullable=False),
Column('All_Gahlfreq_het_NISC',Float(precision=4),nullable=False),
Column('All_Gahlfreq_homvar_NISC',Float(precision=4),nullable=False),
Column('All_Gahlfreq_hetnonref_NISC',Float(precision=4),nullable=False),
Column('All_Gahlfreq_refallele_NISC',Float(precision=4),nullable=False),
Column('All_Gahlfreq_varallele_NISC',Float(precision=4),nullable=False),
Column('All_Gahlref_is_minor_NISC',Integer ,nullable=False),
Column('All_Gahlc_minor_NISC', Integer ,nullable=False),
Column('All_Gahlmaf_NISC',Float(precision=4),nullable=False),
Column('All_Gahlchisquare_NISC',Float(precision=4),nullable=False),
Column('CS_c_genotypes_NISC', Integer ,nullable=False),
Column('CS_c_homref_NISC', Integer ,nullable=False),
Column('CS_c_het_NISC', Integer ,nullable=False),
Column('CS_c_homvar_NISC', Integer ,nullable=False),
Column('CS_c_hetnonref_NISC', Integer ,nullable=False),
Column('CS_c_other_NISC', Integer ,nullable=False),
Column('CS_c_refallele_NISC', Integer ,nullable=False),
Column('CS_c_varallele_NISC', Integer ,nullable=False),
Column('CS_freq_homref_NISC', Float(precision=4) ,nullable=False),
Column('CS_freq_het_NISC', Float(precision=4) ,nullable=False),
Column('CS_freq_homvar_NISC', Float(precision=4) ,nullable=False),
Column('CS_freq_hetnonref_NISC', Float(precision=4) ,nullable=False),
Column('CS_freq_refallele_NISC', Float(precision=4) ,nullable=False),
Column('CS_freq_varallele_NISC', Float(precision=4) ,nullable=False),
Column('CS_ref_is_minor_NISC', Integer ,nullable=False),
Column('CS_c_major_NISC', Integer ,nullable=False),
Column('CS_c_minor_NISC', Integer ,nullable=False),
Column('CS_maf_NISC', Float(precision=4) ,nullable=False),
Column('CS_chisquare_NISC', Float(precision=4) ,nullable=False),
Column('transcript', String(100) ,nullable=False),
Column('ref_aa', String(100) ,nullable=False),
Column('var_aa', String(100) ,nullable=False),
Column('aa_pos', String(100) ,nullable=False),
Column('CDPred_score', String(100) ,nullable=False),
Column('RS_number', String(100) ,nullable=False),
Column('HGMDids', String(100) ,nullable=False),
Column('HGMDdisease', String(300) ,nullable=False),
Column('HGMDtags', String(100) ,nullable=False),
Column('HGMD_HGVS', String(100) ,nullable=False),
Column('REF_MATCHES_HGMD_DISEASE_ALLELE', String(100) ,nullable=False),
Column('HGMD_ALLELES_in_GENOME_STRAND', String(100) ,nullable=False),
Column('HGMDinGene', String(100) ,nullable=False),
Column('K1_total_depth_at_site', String(100) ,nullable=False),
Column('K1_rms_mapping_quality', String(100) ,nullable=False),
Column('K1_num_samples_with_covg', String(100) ,nullable=False),
Column('K1_num_alleles_in_samples_with_covg', String(100) ,nullable=False),
Column('K1_alt_allele_cnts_in_samples_with_covg', String(100) ,nullable=False),
Column('K1_alt_allele_freqs', String(100) ,nullable=False),
Column('K1_allele_balance_in_hets', String(100) ,nullable=False),
#Column('data',String(50000),nullable=False),
)
all_genotypes_table = Table(
'all_genotypes_table_06_02',metadata,
Column('myindex',Integer,primary_key=True),
Column('varid',Integer, nullable=False),
Column('sample_code',String(30),nullable=False),
Column('GT',String(800),nullable=False),
Column('mpg_score',String(10), nullable=True),
Column('coverage', String(10) ,nullable = True ),
)
Index('idx_popdb', popdb_table.c.Chr, popdb_table.c.hg19LeftFlank, popdb_table.c.hg19RightFlank)
Index('idx_gt', all_genotypes_table.c.varid)
metadata.create_all()
my_s = popdb_table.select().order_by(popdb_table.c.myindex.desc()).limit(1)
a = my_s.execute()
last_index=0
for x in a:
last_index = x[0]
#print 'last_index',last_index
break
fr = open(mycfg.population_file)
headerline = fr.readline()
htl = headerline.strip().split('\t')
htl.insert(0,'var_allele_count')
htl.insert(0,'ref_allele_count')
htl.insert(0,'var_gt_count')
htl.insert(0,'ref_gt_count')
htl.insert(0,'all_mpg:coverage_score')
myfile = varfile(htl)
patient_pos = myfile.first_sample_index
linecounter=0
duplicates=0
fr = open(mycfg.population_file)
fr.readline()
for line in fr:
line = unicode(line, 'utf-8')
linecounter+=1
if linecounter <= last_index:
continue
#print line;time.sleep(0.5)
tl=line.strip().split('\t')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
tl.insert(0,'-')
newline = varline(tl,htl)
if not (tl[ htl.index('hg19LeftFlank')].isdigit() and
tl[ htl.index('hg19RightFlank')].isdigit()):
pass
else:
tl[htl.index('var_allele_count')] = str(newline.var_allele_count)
tl[htl.index('ref_allele_count')] = str(newline.ref_allele_count)
tl[htl.index('var_gt_count')] = str(newline.var_gt_count)
tl[htl.index('ref_gt_count')] = str(newline.ref_gt_count)
tl[htl.index('all_mpg:coverage_score')] = str(newline.mpg_coverage_score)
#DO NOT USE REFSEQ as part of the key as it creates errors
#when the same position is annotated with different gene names
# there are a few indel based multiple variants with this key combination
#this is currently not handled by var-md
#work on this later on
last_index +=1
CS_c_genotypes = tl[htl.index('CS_801c_genotypes')]
CS_c_homref = tl[htl.index('CS_801c_homref')]
CS_c_het = tl[htl.index('CS_801c_het')]
CS_c_homvar = tl[htl.index('CS_801c_homvar')]
CS_c_hetnonref = tl[htl.index('CS_801c_hetnonref')]
CS_c_other = tl[htl.index('CS_801c_other')]
CS_c_refallele = tl[htl.index('CS_801c_refallele')]
CS_c_varallele = tl[htl.index('CS_801c_varallele')]
CS_freq_homref = tl[htl.index('CS_801freq_homref')]
CS_freq_het = tl[htl.index('CS_801freq_het')]
CS_freq_homvar = tl[htl.index('CS_801freq_homvar')]
CS_freq_hetnonref = tl[htl.index('CS_801freq_hetnonref')]
CS_freq_refallele = tl[htl.index('CS_801freq_refallele')]
CS_freq_varallele = tl[htl.index('CS_801freq_varallele')]
CS_ref_is_minor = tl[htl.index('CS_801ref_is_minor')]
CS_c_major = tl[htl.index('CS_801c_major')]
CS_c_minor = tl[htl.index('CS_801c_minor')]
CS_maf = tl[htl.index('CS_801maf')]
CS_chisquare = tl[htl.index('CS_801chisquare')]
#myentry = ['%s::%s'%(htl[i],tl[i]) for i in range(76,len(tl))]
#for each item in the line add column header as key and item as value
ins = popdb_table.insert()
ins.execute(
myindex = last_index ,
NISC_Index = tl[htl.index('Index')],
Chr = tl[htl.index('Chr')],
LeftFlank = tl[htl.index('LeftFlank')],
RightFlank = tl[htl.index('RightFlank')],
hg19LeftFlank = tl[htl.index('hg19LeftFlank')],
hg19RightFlank = tl[htl.index('hg19RightFlank')],
ref_allele = tl[htl.index('ref_allele')],
var_allele = tl[htl.index('var_allele')],
muttype = tl[htl.index('muttype')],
variant_type = tl[htl.index('type')],
var_allele_count = newline.var_allele_count,
ref_allele_count = newline.ref_allele_count,
var_gt_count = newline.var_gt_count,
ref_gt_count = newline.ref_gt_count,
all_mpg_coverage_score = newline.mpg_coverage_score,
Gene_name = tl[htl.index('Gene_name')],
loc_type = tl[htl.index('loc_type')],
consequence = tl[htl.index('consequence')],
All_Gahlc_genotypes_NISC = tl[htl.index('All_Gahlc_genotypes')],
All_Gahlc_homref_NISC = tl[htl.index('All_Gahlc_homref')],
All_Gahlc_het_NISC = tl[htl.index('All_Gahlc_het')],
All_Gahlc_homvar_NISC = tl[htl.index('All_Gahlc_homvar')],
All_Gahlc_hetnonref_NISC = tl[htl.index('All_Gahlc_hetnonref')],
All_Gahlc_other_NISC = tl[htl.index('All_Gahlc_other')],
All_Gahlc_refallele_NISC = tl[htl.index('All_Gahlc_refallele')],
All_Gahlc_varallele_NISC = tl[htl.index('All_Gahlc_varallele')],
All_Gahlfreq_homref_NISC = tl[htl.index('All_Gahlfreq_homref')],
All_Gahlfreq_het_NISC = tl[htl.index('All_Gahlfreq_het')],
All_Gahlfreq_homvar_NISC = tl[htl.index('All_Gahlfreq_homvar')],
All_Gahlfreq_hetnonref_NISC = tl[htl.index('All_Gahlfreq_hetnonref')],
All_Gahlfreq_refallele_NISC = tl[htl.index('All_Gahlfreq_refallele')],
All_Gahlfreq_varallele_NISC = tl[htl.index('All_Gahlfreq_varallele')],
All_Gahlref_is_minor_NISC = tl[htl.index('All_Gahlref_is_minor')],
All_Gahlc_minor_NISC = tl[htl.index('All_Gahlref_is_minor')],
All_Gahlmaf_NISC = tl[htl.index('All_Gahlmaf')],
All_Gahlchisquare_NISC = tl[htl.index('All_Gahlchisquare')],
CS_c_genotypes_NISC = CS_c_genotypes,
CS_c_homref_NISC = CS_c_homref,
CS_c_het_NISC = CS_c_het ,
CS_c_homvar_NISC = CS_c_homvar ,
CS_c_hetnonref_NISC = CS_c_hetnonref ,
CS_c_other_NISC = CS_c_other ,
CS_c_refallele_NISC = CS_c_refallele ,
CS_c_varallele_NISC = CS_c_varallele ,
CS_freq_homref_NISC = CS_freq_homref ,
CS_freq_het_NISC = CS_freq_het ,
CS_freq_homvar_NISC = CS_freq_homvar ,
CS_freq_hetnonref_NISC = CS_freq_hetnonref ,
CS_freq_refallele_NISC = CS_freq_refallele ,
CS_freq_varallele_NISC = CS_freq_varallele ,
CS_ref_is_minor_NISC = CS_ref_is_minor ,
CS_c_major_NISC = CS_c_major ,
CS_c_minor_NISC = CS_c_minor ,
CS_maf_NISC = CS_maf ,
CS_chisquare_NISC = CS_chisquare ,
transcript = tl[htl.index('transcript')],
ref_aa = tl[htl.index('ref_aa')],
var_aa = tl[htl.index('var_aa')],
aa_pos = tl[htl.index('aa_pos')],
CDPred_score = tl[htl.index('CDPred_score')],
RS_number = tl[htl.index('RS#')],
HGMDids = tl[htl.index('HGMDids')],
HGMDdisease = tl[htl.index('HGMDdisease')],
HGMDtags = tl[htl.index('HGMDtags')],
HGMD_HGVS = tl[htl.index('HGMD_HGVS')],
REF_MATCHES_HGMD_DISEASE_ALLELE = tl[htl.index('REF_MATCHES_HGMD_DISEASE_ALLELE')],
HGMD_ALLELES_in_GENOME_STRAND = tl[htl.index('HGMD_ALLELES_in_GENOME_STRAND')],
HGMDinGene = tl[htl.index('HGMDinGene')],
K1_total_depth_at_site = tl[htl.index('1K_total_depth_at_site')],
K1_rms_mapping_quality = tl[htl.index('1K_rms_mapping_quality')],
K1_num_samples_with_covg = tl[htl.index('1K_#samples_with_covg')],
K1_num_alleles_in_samples_with_covg = tl[htl.index('1K_#alleles_in_samples_with_covg')],
K1_alt_allele_cnts_in_samples_with_covg = tl[htl.index('1K_alt_allele_cnts_in_samples_with_covg')],
K1_alt_allele_freqs = tl[htl.index('1K_alt_allele_freqs')],
K1_allele_balance_in_hets = tl[htl.index('1K_allele_balance_in_hets')],
#data = ';;'.join(myentry),
)
#myentry = ['%s::%s'%(htl[i],tl[i]) for i in range(76,len(tl))]
#print '='*40
#for i in range(76,len(tl)-2,3):
for i in range(patient_pos, len(tl)-2,3):
#print htl[i], tl[i:i+3]
#time.sleep(0.1)
ins = all_genotypes_table.insert()
ins.execute(
myindex = None,
varid = last_index,
sample_code = htl[i].replace('.NA','') ,
GT = tl[i] ,
mpg_score = tl[i+1] ,
coverage = tl[i+2] ,
)
if linecounter%30000 == 0:
print linecounter
#result=c.execute('select * from kgdict limit 10''')
#print result.fetchone()
#time.sleep(2)
print 'finished processing %s'%mycfg.population_file
print 'Total %s lines had duplicates'%duplicates
print 'popdb database created'
sendmail()
def hg19_bin(chr,pos,coords19):
i=0
for coord19 in coords19:
if chr.lower().replace('chr','') == coord19[0].lower().replace('chr','') and int(coord19[1]) <= int(pos) and int(coord19[2]):
return i
break
else:
pass
i+=1
def dbsnp_vcf2db():
"""
Populates the mysql database with dbsnp data
"""
kgdict_table = Table(
'kgdict',metadata,
Column('myindex',Integer,primary_key=True),
Column('Chr',String(10),nullable=False),
#Column('bin',Integer,nullable=False),
Column('pos',Integer,nullable=False),
Column('id',String(100),nullable=False),
Column('ref',String(800),nullable=False),
Column('alt',String(300),nullable=False),
Column('qual',String(200),nullable=False),
Column('filter',String(20),nullable=False),
Column('info',String(200),nullable=False),
)
Index('idx_kgdict', kgdict_table.c.Chr, kgdict_table.c.pos)
metadata.create_all()
#mydbpath = os.path.join('/scratch',os.path.basename(mycfg.dbsnp_db_path))
#conn = sqlite3.connect(mydbpath)
#c=conn.cursor()
#c.execute('''
#create table IF NOT EXISTS kgdict (chr INTEGER,
# bin INTEGER,
# pos INTEGER,
# id TEXT,
# ref TEXT,
# alt TEXT,
# qual TEXT,
# filter TEXT,
# info)''')
linecounter=0
import gzip
for line in gzip.open(mycfg.dbsnp_vcf_path):
linecounter+=1
if not line.startswith('#'):
#print line;time.sleep(0.5)
tl=line.strip().split('\t')
if tl[0]=='MT': #correct the mitochondrial chromosome designation for internal consistency
tl[0]='M'
mytl7 = tl[7].split(';')
for x in mytl7:
if 'RSPOS=' in x:
mytl7.remove(x)
if 'VP=' in x:
mytl7.remove(x)
ins = kgdict_table.insert()
ins.execute(
myindex=None,
Chr= tl[0],
#bin = '',
pos = tl[1],
id = tl[2],
ref = tl[3],
alt = tl[4],
qual = tl[5],
filter = tl[6],
info = ';'.join(mytl7)
)
#except Exception:
# print "can't insert line in to db", tl
if linecounter %500000 == 0:
print linecounter
#result=c.execute('select * from kgdict limit 10''')
#print result.fetchone()
#time.sleep(2)
print 'database created at %s'%mycfg.dbsnp_db_path
#c.execute('CREATE INDEX idx_kgdict ON kgdict (chr,bin,pos)')
#conn.commit()
#c.close()
#os.system('cp %s %s'%(mydbpath, os.path.dirname(os.path.abspath(sys.argv[0]))))
sendmail()
def ESP_vcf2db():
"""
Populates the mysql database with Exome Sequencing Project data
"""
ESP_variants_table = Table(
'ESP_variants',metadata,
Column('myindex',Integer, primary_key = True),
Column('Chr',String(10),nullable = False),
#Column('bin',Integer,nullable=False),
Column('pos',Integer,nullable=False, autoincrement = False),
Column('id',String(100),nullable=False),
Column('ref',String(800),nullable=False),
Column('alt',String(300),nullable=False),
Column('qual',String(200),nullable=False),
Column('filter',String(20),nullable=False),
Column('DBSNP',String(100),nullable=False),
Column('EA_AC',String(100), nullable = False ),
Column('AA_AC',String(100), nullable = False ),
Column('TAC',String(100), nullable = False ),
Column('info',String(700),nullable = False),
)
Index('idx_ESP_variants', ESP_variants_table.c.Chr, ESP_variants_table.c.pos)
metadata.create_all()
linecounter=0
for line in open(args.input_file):
linecounter+=1
if not line.startswith('#'):
#print line;time.sleep(0.5)
tl=line.strip().split('\t')
if tl[0]=='MT': #correct the mitochondrial chromosome designation for internal consistency
tl[0]='M'
mytl7 = tl[7].split(';') #split info field
dbsnp=''
ea_ac=''
aa_ac=''
tac=''
mytlinfo= deepcopy(mytl7)
for x in mytl7:
if 'DBSNP=' in x:
dbsnp = x.split("=")[1]
mytlinfo.remove(x)
elif 'EA_AC=' in x:
ea_ac = x.split("=")[1]
mytlinfo.remove(x)
elif 'AA_AC=' in x:
aa_ac = x.split("=")[1]
mytlinfo.remove(x)
elif 'TAC=' in x:
tac = x.split("=")[1]
mytlinfo.remove(x)
ins = ESP_variants_table.insert()
ins.execute(
myindex=None,
Chr= tl[0],
#bin = '',
pos = tl[1],
id = tl[2],
ref = tl[3],
alt = tl[4],
qual = tl[5],
filter = tl[6],
DBSNP = dbsnp,
EA_AC = ea_ac,
AA_AC = aa_ac,
TAC = tac,
info = ';'.join(mytlinfo)
)
if linecounter %500000 == 0:
print linecounter
print 'database created at mysql server '
sendmail()
def prepare_liftover_input(vsfile, outfile):
"""
input: vsfile provided by the function call
logic: create a bed file that contains chr start stop and index fields from\
the vs.filtered file
output:created bed file to be used as input for the liftover program for adding\
hg19 coordinates to the files
"""
try:
infile = open(vsfile)
except Exception:
print 'The input file %s was not found, please make sure you have supplied the correct path'
raise Exception
hline = infile.readline()
htl = hline.strip().split('\t')
fw = open(outfile,'w')
#fw.write(inlines[0])
for line in infile:
tl=line.strip().split('\t')
ntl='%s '*4%(tl[htl.index('Chr')],tl[htl.index('LeftFlank')],tl[htl.index('RightFlank')],tl[htl.index('Index')])
#print 'ntl',ntl
#time.sleep(100)
fw.write(ntl+'\n')
#time.sleep(2)
def add_hg19(infile,hg19bed,hg19outfile):
"""
infile is the hg18 coordinate varsifter file, hg19coordinates in a
liftover created bed file, [hg19outfile] is the outputfile name
logic: adds hg19 coordinates to infile based on index numbers
output: new file with hg19 coordinates added
"""
try: #create a dictionary where the key is the index number of the input file and the value
#is the hg19 coordinates for faster search
hg19file = open(hg19bed)
hg19lifts={}
for chr in chrs:
hg19lifts[chr]={}
"""
for x in hg19file:
tl=x.strip().split('\t')
#if tl[0].lower()==chr.lower():
#hg19lifts[tl[3]]=tl[1:3]
hg19lifts[tl[3]]=tl[1:3]
"""
for chr in chrs:
hg19file.seek(0)
for x in hg19file:
tl=x.strip().split('\t')
if tl[0].lower()==chr.lower():
hg19lifts[chr][tl[3]]=tl[1:3]
except Exception:
print 'hg19bed file was not read into memory. Please check add_hg19 function for details.'
fw=open(hg19outfile,'w')
infile = open(infile)
hline = infile.readline()
htl = hline.strip().split('\t')
nhtl = htl[:htl.index('RightFlank')+1]+['hg19LeftFlank','hg19RightFlank']+htl[htl.index('RightFlank')+1:]
fw.write('\t'.join(nhtl)+'\n')
#outputcounter=0
for line in infile:
tl=line.strip().split('\t')
#print tl[0]
ntl=[]
#time.sleep(1)
if hg19lifts.get(tl[htl.index('Chr')]):
if tl[htl.index('Index')] in hg19lifts.get(tl[htl.index('Chr')]).keys():
#print 'tl',tl
#print 'hg19lifts tl[]',hg19lifts[tl[0]]
#outputcounter+=1
#print 'tl',tl
#print 'hg19tl',hg19tl
#ntl=tl[:4]+hg19tl[1:3]+tl[4:]
ntl = tl[:htl.index('RightFlank')+1] + hg19lifts[tl[htl.index('Chr')]][tl[htl.index('Index')]] + tl[htl.index('RightFlank')+1:]
#print 'ntl',ntl
fw.write('\t'.join(ntl)+'\n')
#time.sleep(1)
#elif tl[0] not in hg19lifts[tl[htl.index('Chr')]].keys():
else:
#outputcounter+=1
ntl=tl[:htl.index('RightFlank')+1]+['-','-']+tl[htl.index('RightFlank')+1:]
fw.write('\t'.join(ntl)+'\n')
#print 'Number of Output lines are %s '%outputcounter
fw.close()
return
def single_gts(tl1,htl, sample_code):
"""
This function gets a line and a headerline as input,
splits it by tab and determines the genotype of individual
returns a dictionary with UDP code as keys and pt raw genotypes and
pt calculated genotypes as values
gts [0]= patient genotype with two alleles
gts [1]= patient calculated zygosity state (hom_ref,
hom_var,
het_complete,
var_incoplete,
ref_incomplete,
novel_mutation,
ref_novel_het,
var_novel_het,
NA )
gts [2]= mpg score
gts [3]= coverage
gts [4]= sex
gts [5]= affected
gts [6]= mom udpcode
gts [7]= dad udpcode
gts [8]= founder GT in DB
gts [9]= birth year
"""
test=0
gts={}
if len(tl1) != len(htl):
sys.stderr.write('tl != htl len(tl)= %s len(htl)=%s\n'%(len(tl1), len(htl)))
sys.stderr.write('tl= %s\n'%tl1)
sys.stderr.write('htl= %s\n'%htl)
raise Exception
myvarfile = varfile(htl)
#myvarfile.get_project_members(project_code)
#myaffected = myvarfile.project_members
pos = myvarfile.first_sample_index
if test:
print 'this is pos as a result of using myvarfile.first_sample_index\
with htl ',pos
print 'this is tl [pos-5:pos+5]',tl1[int(pos)-5:int(pos)+5]
print 'This is the item at tl[pos]',tl1[pos]
time.sleep(1)
print '\nline in get_genotypoes() is ',tl1
print 'patient data in line in get_genotypes_ is ', tl1[pos-1:]
#print 'number of patients determined by get_genotypes_() is ', len(myaffected)
time.sleep(1)
#for sample_code,v in myaffected.iteritems():
#print 'in get_genotypes_ function: pt_code is => %s'%sample_code
#time.sleep(1)
#print "data dictionary = 1:hom ref,2:hom var, 3:het complete,\
#4:ref incomplete,5:var incompelete, 6:single novel mutation,\
#7:ref novel het, 8:var novel het , 9:hom_novel"
gt_class=""
if len(tl1[htl.index('%s.NA'%sample_code)])<2:
if test:
print "found incomplete genotype",tl1
print tl1[htl.index('%s.NA'%sample_code)][allele]
#if tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["var_allele"]]:
if tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('var_allele')]:
gt_class="var_incomplete"
#elif tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["ref_allele"]]:
elif tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('ref_allele')]:
gt_class="ref_incomplete"
elif (tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("ref_allele")] and
tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("var_allele")]):
gt_class="novel_mutation"
else:
print "something wrong check the row",tl1
#time.sleep(1)
elif len(tl1[htl.index('%s.NA'%sample_code)])==2:
varcount=0
refcount=0
novelcount=0
if tl1[htl.index('%s.NA'%sample_code)]=="NA":
gt_class="NA"
else:
#print "this is a valid genotype =>",tl1[htl.index('%s.NA'%sample_code)]
#print tl1[htl.index('%s.NA'%sample_code)]
#print tl1[htl.index('var_allele']]
for allele in xrange(2):
#print tl1[htl.index('%s.NA'%sample_code)][allele]
if tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("var_allele")]:
varcount+=1
elif tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("ref_allele")]:
refcount+=1
elif (tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("ref_allele")] and
tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("var_allele")]):
novelcount+=1
#a=freq[tl1[htl.index('%s.NA'%sample_code)][allele]]
#print a[0],a[1]
#freql.append(a[1]+1)
#freq[tl1[htl.index('%s.NA'%sample_code)][allele]]=[gt_id[0][1],a[1]+1]
#print "varcount",varcount
#print "ref count",refcount
if varcount==2:
gt_class="hom_var"
elif refcount==2:
gt_class="hom_ref"
elif novelcount==2:
gt_class="hom_novel"
elif refcount==1 and varcount==1:
gt_class="het_complete"
elif refcount==1 and novelcount==1:
gt_class="ref_novel_het"
elif varcount==1 and novelcount==1:
gt_class="var_novel_het"
else:
print "something wrong check the row",tl1[a:a+3]
#print 'line is', line
#print 'calculated genotype class is',gt_class
#time.sleep(1)
elif len(tl1[htl.index('%s.NA'%sample_code)])>2:
if ":" in tl1[htl.index('%s.NA'%sample_code)]:
varcount=0
refcount=0
novelcount=0
#print 'found a double colon seperated one',tl1
for allele in xrange(2):
#print "allele = %s "%allele
allele_gt=""
if allele==0:
allele_gt=tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))]
elif allele==1:
allele_gt=tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:]
else:
print "allele gt could not be determined"
print "allele_gt is:",allele_gt
#print tl1[htl.index('%s.NA'%sample_code)][allele]
if allele_gt==tl1[htl.index("ref_allele")]:
refcount+=1
elif allele_gt==tl1[htl.index("var_allele")]:
varcount+=1
elif allele_gt!=tl1[htl.index("ref_allele")] and allele_gt!=tl1[htl.index("var_allele")]:
novelcount+=1
else:
print "something wrong with determining the genotype counts , check the row",tl1[a:a+3]
if varcount==2:
gt_class="hom_var"
elif refcount==2:
gt_class="hom_ref"
elif novelcount==2:
gt_class="hom_novel"
elif refcount==1 and varcount==1:
gt_class="het_complete"
elif refcount==1 and novelcount==1:
gt_class="ref_novel_het"
elif varcount==1 and novelcount==1:
gt_class="var_novel_het"
else:
print "ref gt=",tl1[htl.index("ref_allele")],len(tl1[htl.index("ref_allele")])
print "var gt=",tl1[htl.index("var_allele")],len(tl1[htl.index("var_allele")])
print "patient gt=",tl1[htl.index('%s.NA'%sample_code)],tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))],len(tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))]),tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:],len(tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:])
print "rc",refcount,"vc",varcount,"nc",novelcount
print "something wrong determining gt class variables, check the row",tl1[a:a+3]
#print ">2 freq1",freq1
#time.sleep(1)
else:
if tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("var_allele")]:
gt_class="hom_var"
elif tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("ref_allele")]:
gt_class="hom_ref"
elif tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("var_allele")]:
gt_class="het_complete"
else:
print "something wrong check the row",tl1[a:a+3],tl1[htl.index('%s.NA'%sample_code)]
else:
print 'row is not 2 or more characters long check row',tl1
#read family data table in to a dictionary
#patient_ids, family_data = p_ids()
#print 'pt_code in get genotypes() function is =>',sample_code
gts[sample_code] = [tl1[htl.index('%s.NA'%sample_code)],
gt_class,
tl1[htl.index('%s.NA'%sample_code)+1],
#tl1[htl.index('%s.NA'%sample_code)+2]] + family_data[sample_code]
tl1[htl.index('%s.NA'%sample_code)+2]]
"""
print 'could not get value into gts dictionary, family data may be'\
'inconsistent with the variant report file make sure the patient'\
'codes match between the two files'
print 'Family data returned by p_ids() function as family_data =>',family_data
print '\npt_code in get genotypes() is %s \n'%sample_code
print 'tl1:',tl1
print 'gts dictionary is as follows'
for k,v in gts.items():
print k,'==>',v
raise Exception
#print 'counter is %s'%counter
#time.sleep(.5)
"""
#test code for the fucntion
#if len(gts)!= len(myaffected):
# print "num_pts is %s"%len(myaffected)
# print "gts is %s items long"%len(gts)
# print tl1
# print 'gts:',gts
# raise Exception
#for k,v in gts.items():
#print 'genotype dict entry is k =%s v=>%s'%(k,v)
#time.sleep(1)
# assert len(v)==11,'genotypes dictionary was not constructed correctly, please\
# check the results k=%s, v=%s v is %s items long'%(k,v,len(v))
#time.sleep(1)
#print 'testcode gts=',gts
#print tl1
#time.sleep(30)
return gts
def split_allfams_bysample():
"""
takes and allfams file and takes all variant columns and a single sample that is supplied by udp numbers
"""
log_file_path = os.path.join(os.path.dirname(args.output_file), 'log.txt')
log_file = open(log_file_path, 'a')
infile = open(args.input_file)
hline =infile.readline()
htl = hline.strip().split('\t')
myvarfile = varfile(htl)
#print myvarfile.first_sample_index
nhtl = htl[:myvarfile.first_sample_index]
fam_members_indexes=[]
no_samples_in_current_all_projects_file = True
if '%s.NA'%args.project_code in htl:
#print htl.index('%s.NA'%sample)
fam_members_indexes.append(htl.index('%s.NA'%args.project_code))
log_file.write('Sample %s is found in the input allfamilies file\n'%(args.project_code))
no_samples_in_current_all_projects_file = False
else:
log_file.write('Sample %s is not found in the input allfamilies file\n'%(args.project_code))
for x in fam_members_indexes:
nhtl.append(htl[x])
nhtl.append(htl[x+1])
nhtl.append(htl[x+2])
myout = open(args.output_file,'w')
if args.extract_subject_only:
myout.write('\t'.join(nhtl)+'\n')
else:
myout.write('\t'.join(htl)+'\n')
if no_samples_in_current_all_projects_file:
pass
else:
#c=0
#c1=0
for line in infile:
#c +=1
#print c
#if c>200:
# break
tl = line.strip().split('\t')
ntl = tl[:myvarfile.first_sample_index]
for x in fam_members_indexes:
ntl.append(tl[x])
ntl.append(tl[x+1])
ntl.append(tl[x+2])
mygts = single_gts(ntl, nhtl , args.project_code)
assert len(mygts)==1
all_are_homref_or_NA = True
gt_homref_and_ref_is_minor = False
for k,v in mygts.iteritems():
if (v[1] != 'hom_ref' and not tl[htl.index('All_Gahlref_is_minor')]=="1") and v[1] != 'NA':
all_are_homref_or_NA = False
if (v[1] == 'hom_ref' and tl[htl.index('All_Gahlref_is_minor')]=="1"):
gt_homref_and_ref_is_minor = True
if not all_are_homref_or_NA or gt_homref_and_ref_is_minor:
#c1+=1
#print mygts
#print tl[htl.index('All_Gahlref_is_minor')]
#time.sleep(0.1)
if args.extract_subject_only:
myout.write('\t'.join(ntl)+'\n')
else:
myout.write('\t'.join(tl)+'\n')
myout.close()
#else:
# sys.stderr.write(line)
#myvarfile = varfile(args.input_file)
#myvarfile.get_project_members(args.project_code)
#myaffected = myvarfile.project_members
def split_allfams_byfamily():
"""
takes and allfams file and takes all variant columns and only samples that are supplied in the family file by udp numbers
"""
samples_table = Table('varmd_samples_table', metadata, autoload=True)
infile = open(args.input_file)
hline =infile.readline()
htl = hline.strip().split('\t')
myvarfile = varfile(htl)
myaffected = myvarfile.get_project_members(args.project_code)
#print myvarfile.first_sample_index
nhtl = htl[:myvarfile.first_sample_index]
fam_members_indexes=[]
no_samples_in_current_all_projects_file = True
for sample in myaffected.keys():
if '%s.NA'%sample in htl:
#print htl.index('%s.NA'%sample)
fam_members_indexes.append(htl.index('%s.NA'%sample))
sys.stdout.write('FAMILY/PROJECT CODE= %s : Sample %s is found in the input allfamilies file\n'%(args.project_code, sample))
updt = samples_table.update().where(samples_table.c.sample_code == sample).values(
data_in_allprojects_file = 'Yes')
updt.execute()
no_samples_in_current_all_projects_file = False
else:
sys.stdout.write('NOTICE: FAMILY/PROJECT CODE= %s : Sample %s is not found in the input allfamilies file\n'%(args.project_code, sample))
updt = samples_table.update().where(samples_table.c.sample_code == sample).values(
data_in_allprojects_file = 'No')
updt.execute()
#raise Exception
for x in fam_members_indexes:
nhtl.append(htl[x])
nhtl.append(htl[x+1])
nhtl.append(htl[x+2])
myout = open(args.output_file,'w')
myout.write('\t'.join(nhtl)+'\n')
if no_samples_in_current_all_projects_file:
pass
else:
first_sample_index = myvarfile.first_sample_index
patient_ids, family_data = p_ids()
different_phenotypes_are_present = False
if len(set([v['affected'] for v in myaffected.values()]) ) > 1:
different_phenotypes_are_present = True
#c=0
for line in infile:
#c +=1
#print c
#if c>800:
# break
tl = line.strip().split('\t')
ntl = tl[:first_sample_index]
for x in fam_members_indexes:
ntl.append(tl[x])
ntl.append(tl[x+1])
ntl.append(tl[x+2])
mygts = get_genotypes(ntl, nhtl, myaffected, first_sample_index, family_data)
all_affecteds_are_hom_ref = True
all_are_homref_or_NA = True
all_genotypes_are_same = True
if len(set([v[1] for v in mygts.values()])) > 1:
all_genotypes_are_same = False
elif len(set([v[1] for v in mygts.values()])) == 1:
all_genotypes_are_same = True
elif len(set([v[1] for v in mygts.values()])) == 0 and len(mygts) == 0:
pass
else:
for k,v in mygts.iteritems():
print k,v
print (set([v[1] for v in mygts.values()]))
raise Exception
for k,v in mygts.iteritems():
if v[1] != 'hom_ref' and v[1] != 'NA':
all_are_homref_or_NA = False
if int(myaffected[k]['affected']) == 1 and v[1] != 'hom_ref':
all_affecteds_are_hom_ref = False
if ( (not all_are_homref_or_NA) and
(not all_affecteds_are_hom_ref) and
(not (different_phenotypes_are_present and all_genotypes_are_same))
):
myout.write('\t'.join(ntl)+'\n')
myout.close()
#else:
# sys.stderr.write(line)
#myvarfile = varfile(args.input_file)
#myvarfile.get_project_members(args.project_code)
#myaffected = myvarfile.project_members
def annotate_with_interval_data():
"""
python --func annotate_with_interval_data --input [varmd_file] --annotation_file [validation_file] > outputfile
"""
annot_lines = [x.strip().split('\t') for x in open(args.annotation_file) if 'chr' in x.lower()]
if (not 'Chr' in annot_lines[0]) or (not 'LeftFlank' in annot_lines[0]) or (not 'RightFlank' in annot_lines[0]) :
raise Exception
assert len(annot_lines) >=2, 'annotation file seems empty'
infile = open(args.input_file)
htl = infile.readline().strip().split('\t')
if (not 'Chr' in htl) or (not 'LeftFlank' in htl) or (not 'RightFlank' in htl) :
print htl
raise Exception
htl.insert(0,'Validation')
print '\t'.join(htl)
used_for_annotations = []
for line in infile:
tl = line.strip().split('\t')
tl.insert(0,'-')
for atl in annot_lines[1:]:
if (tl[htl.index('Chr')] == atl[annot_lines[0].index('Chr')] and
tl[htl.index('LeftFlank')] == atl[annot_lines[0].index('LeftFlank')] and
tl[htl.index('RightFlank')] == atl[annot_lines[0].index('RightFlank')]):
tl[htl.index('Validation')] = atl[0]
annot_lines.remove(atl)
used_for_annotations.append(atl)
print '\t'.join(tl)
"""
print 'tl',tl
print 'atl',atl
print 'annot_lines',annot_lines
print 'annot_lines[0]',annot_lines[0]
print 'htl.index(chr)',htl.index('Chr')
"""
with open('%s.used'%args.annotation_file,'a') as fw:
for atl in used_for_annotations:
fw.write('\t'.join(atl)+'\n')
def run_analysis_single_file(configfile):
"""
creates the necessary files for analysis pipeline workflow under Biowulf cluster
"""
test=0
verbose=0
skip_hg19_add_flag = 0
nodetype = mycfg.nodetype
#print os.path.dirname(inputvsfile)
if args.input_file:
inputvsfile = os.path.abspath(args.input_file)
else:
print 'Please provide a varmd input file by --input [varmd_input_file_path]'
sys.exit()
if args.project_code:
project_code = args.project_code
else:
print 'please provide a project code by --project_code [project_code]'
sys.exit()
tmpdir='{0}_tmp'.format(os.path.join( os.path.split(os.path.dirname(inputvsfile))[0],
os.path.basename(inputvsfile)
))
#tmpdir='./{}_tmp'.format(os.path.basename(inputvsfile))
if verbose:
print os.path.dirname(os.path.abspath(inputvsfile))
print os.path.basename(inputvsfile)
print 'tmpdir',tmpdir
if not os.path.exists(tmpdir):
os.mkdir(tmpdir)
masterpath='%s/%s_master.sh'%(tmpdir,os.path.basename(inputvsfile))
fm=open(masterpath,'w')
if nodetype=='g72':
nodetypeswarm=':g72 -n 8'
nodetypeqsub=':g72'
elif nodetype=='g24':
nodetypeswarm=':g24 -n 8'
nodetypeqsub=':g24'
elif nodetype=='norm':
nodetypeswarm=''
nodetypeqsub=''
else:
print 'something wrong with the node selection'
sys.exit()
depend=0
with open(inputvsfile) as myvsfile:
hline = myvsfile.readline()
if ('hg19LeftFlank' in hline) and ('hg19RightFlank' in hline):
skip_hg19_add_flag =1
if skip_hg19_add_flag:
#the header line should be processed and hg19LeftFlank and hg19RightFlank should be present in the file
hg19outfile = '{0}/{1}_withhg19'.format(tmpdir,os.path.basename(inputvsfile))
os.system('cp {0} {1}'.format(inputvsfile,hg19outfile))
else:
##############################################################################
outfile='%s/%s_liftbed'%(tmpdir,os.path.basename(inputvsfile))
prep_liftover_path='%s/1_%s_prepare_liftover_input.sh'%(tmpdir,
os.path.basename(inputvsfile)
)
plf=open(prep_liftover_path,'w')
#plf.write('%s %s --func prepare_liftover_input %s %s'%(myutils.python_path,
plf.write('%s --func prepare_liftover_input --config %s --input %s --output %s'%(
sys.argv[0],
configfile,
inputvsfile,
outfile
))
plf.close()
#prepare the input file for liftOver
depend+=1
fm.write('jobid{0}=`qsub -l nodes=1{1} -N lft2{3} {4}`\n'.format(
depend,
nodetypeqsub,
depend-1,
project_code,
prep_liftover_path))
fm.write('echo $jobid{0}\n'.format(depend))
###############################################################################
liftoverpath='%s/2_%s.fixed_liftover.sh'%(tmpdir,os.path.basename(inputvsfile))
lf=open(liftoverpath,'w')
liftoverout = '{0}/{1}_liftedbed'.format(tmpdir,os.path.basename(inputvsfile))
liftoverunmapped = '{0}/{1}_hg19liftover_unmapped'.format(tmpdir,os.path.basename(inputvsfile))
lf.write('{0} {1} {2} {3} {4}'.format(
mycfg.liftOver_path,
outfile,
mycfg.liftOver_chain_path,
liftoverout,
liftoverunmapped
))
lf.close()
#print 'check liftover file at %s'%liftoverpath
#time.sleep(1000)
#run liftOver
depend+=1
fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2} -N lftr3{3} {4}`\n'.format(
depend,
nodetypeqsub,
depend-1,
project_code,
liftoverpath))
fm.write('echo $jobid{0}\n'.format(depend))
################################################################################
add_hg19_path='%s/3_%s_add_hg19.sh'%(tmpdir,os.path.basename(inputvsfile))
ahgf=open(add_hg19_path,'w')
hg19outfile = '{0}/{1}_withhg19'.format(tmpdir,os.path.basename(inputvsfile))
ahgf.write('{0} --func add_hg19 --config {1} --input {2} --liftover {3} --output {4}\n'.format(
sys.argv[0],
configfile,
inputvsfile,
liftoverout,
hg19outfile,
))
ahgf.close()
#add hg19 coordinates to file based on liftOver output
depend+=1
fm.write('jobid{0}=`swarm --jobarray --singleout\
-f {1}\
-W depend=afterany:$jobid{2}\
-l nodes=1{3}\
-N ad19_4{4}\
-b 3`\n'.format(depend,
add_hg19_path,
depend-1,
nodetypeswarm,
project_code)
)
fm.write('echo $jobid{0}\n'.format(depend))
####################################################################################
skip_seattleseq_flag = False
with open(inputvsfile) as myvsfile:
hline = myvsfile.readline()
if ('ss_chromosome' in hline) and ('ss_position' in hline):
skip_seattleseq_flag = True
prep_ss_path='%s/4_%s_prep_seattleSeq.sh'%(tmpdir,os.path.basename(inputvsfile))
fw=open(prep_ss_path,'w')
outpath_snv = os.path.splitext(hg19outfile)[0]+'.ss_snv'
outpath_indel = os.path.splitext(hg19outfile)[0]+'.ss_indel'
fw.write('{0} --func prep_ss --input {1} --output_snv {2} --output_indel {3} --config {4}\n'.format(
sys.argv[0],
hg19outfile,
outpath_snv,
outpath_indel,
configfile
))
fw.close()
#
depend+=1
fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\
-N pss_51{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
prep_ss_path
))
fm.write('echo $jobid{0}\n'.format(depend))
###################################################################################
#incoporate Kaviar, 1000 genomes and allsamples file data into varsifter files
dbsnp_outfile = '{0}_dbsnp'.format(hg19outfile)
annotate_dbsnp_path='%s/5_%s.annotate_with_dbsnp.sh'%(tmpdir,os.path.basename(inputvsfile))
dbsnp_in_file_flag = False
dbsnpheader_in_file_flag = True
if os.path.exists(hg19outfile):
if verbose: print 'hg19 file exists' , hg19outfile
with open(hg19outfile) as myvsfile:
hline = myvsfile.readline()
htl = hline.strip().split('\t')
headers_indbsnp_added_file = [
'Homozygous_VarAllele_UDP_count',
'Single_VarAllele_UDP_count',
'all_mpg:coverage_score',
'dbsnp137G5',
'dbsnp137ClinicalSignificance',
'dbsnp137GMAF',
'dbsnp137RS#',
'dbsnp137',
'predicted_expression_snp',
'varscore',
]
for header_item in headers_indbsnp_added_file:
if header_item not in hline:
dbsnpheader_in_file_flag = False
if verbose: print 'header item not found in the file header item not found=%s'%header_item
if dbsnpheader_in_file_flag:
dbsnp = []
linecounter =0
for line in myvsfile:
tl = line.strip().split('\t')
dbsnp.append(tl[htl.index('dbsnp137')])
if linecounter > 500:
break
myset= set(dbsnp)
if len(myset)>1:
dbsnp_in_file_flag = True
if dbsnp_in_file_flag and dbsnpheader_in_file_flag:
#the header line should be processed and hg19LeftFlank and hg19RightFlank should be present in the file
if verbose: print 'dbsnp in file flag true, dbsnp header in file flag true'
os.system('cp {0} {1}'.format(hg19outfile,dbsnp_outfile))
f_tmp = open(annotate_dbsnp_path,'w')
f_tmp.write('{0} --func annotate_with_dbsnp --config {1} --input {2} --output {3}\n'.format(
sys.argv[0],
configfile,
hg19outfile,
dbsnp_outfile,
))
f_tmp.close()
if not os.path.exists(hg19outfile) or not dbsnp_in_file_flag:
if verbose: print 'hg19 file doesn\'t exist or dbsnp data not in file'
dbsnpheader_in_file_flag = False
f_tmp = open(annotate_dbsnp_path,'w')
f_tmp.write('{0} --func annotate_with_dbsnp --config {1} --input {2} --output {3}\n'.format(
sys.argv[0],
configfile,
hg19outfile,
dbsnp_outfile,
))
f_tmp.close()
depend+=1
if depend>1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-W depend=afterany:$jobid{2} -l nodes=1{3} -N ann6{4}\
-p 100 -b 8 `\n'.format(depend,
annotate_dbsnp_path,
depend-1,
nodetypeswarm,
project_code
))
elif depend<=1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-l nodes=1{2} -N ann6{3} -p 100 -b 8`\n'.format(depend,
annotate_dbsnp_path,
nodetypeswarm,
project_code
))
fm.write('echo $jobid{0}\n'.format(depend))
####################################################################################
allfams_outfile= '{0}_pop'.format(dbsnp_outfile)
allfams_in_file_flag=False
if os.path.exists(dbsnp_outfile):
with open(dbsnp_outfile) as myvsfile:
hline = myvsfile.readline()
htl = hline.strip().split('\t')
linecounter =0
single_VariantAllele_UDP_counts = []
if 'Single_VarAllele_UDP_count' in hline:
for line in myvsfile:
tl = line.strip().split('\t')
single_VariantAllele_UDP_counts.append(tl[htl.index('Single_VarAllele_UDP_count')])
if linecounter > 500:
break
myset= set(single_VariantAllele_UDP_counts)
if len(myset)>1:
allfams_in_file_flag = True
if allfams_in_file_flag:
os.system('cp {0} {1}'.format(dbsnp_outfile,allfams_outfile ))
else:
annotate_allfams_path='%s/6_%s.annotate_with_allfams.sh'%(tmpdir,os.path.basename(inputvsfile))
f_tmpswarm=open(annotate_allfams_path,'w')
f_tmpswarm.write('{0} --func annotate_with_allfams --config {1} --input {2}_dbsnp\
--output {2}_dbsnp_pop\n'.format(
sys.argv[0],
configfile,
hg19outfile,
))
f_tmpswarm.close()
#-------------------------------------------------------------------------------
#incoporate allsamples file data into varsifter files
depend+=1
if depend>1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-W depend=afterany:$jobid{2} -l nodes=1{3} -N af7{4}\
-p 100 -b 2`\n'.format(depend,
annotate_allfams_path,
depend-1,
nodetypeswarm,
project_code
))
elif depend<=1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-l nodes=1{2} -N af7{3} -p 100 -b 2`\n'.format(depend,
annotate_allfams_path,
nodetypeswarm,
project_code
))
fm.write('echo $jobid{0}\n'.format(depend))
###############################################################################
seattle_seq_snv_path='%s/7_%s_seattleseq_snv.sh'%(tmpdir,
os.path.basename(inputvsfile)
)
sssnv_outfile ="%s_ss_snv"%allfams_outfile
ffw=open(seattle_seq_snv_path,'w')
ffw.write('{0} --func annotate_ss_snv --config {1} --input {2}\
--output {3}\n'.format(
sys.argv[0],
configfile,
allfams_outfile,
sssnv_outfile,
))
ffw.close()
if skip_seattleseq_flag == True:
os.system('cp {0} {1}'.format(allfams_outfile ,sssnv_outfile))
depend+=1
if depend>1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-W depend=afterany:$jobid{2} -l nodes=1{3} -N sssnv{4}\
-p 100 -b 2 `\n'.format(depend,
seattle_seq_snv_path,
depend-1,
nodetypeswarm,
project_code
))
elif depend<=1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-l nodes=1{2} -N sssnv{3} -p 100 -b 2 `\n'.format(depend,
seattle_seq_snv_path,
nodetypeswarm,
project_code
))
fm.write('echo $jobid{0}\n'.format(depend))
###############################################################################
seattle_seq_indel_path='%s/8_%s_seattleseq_indel.sh'%(tmpdir,
os.path.basename(inputvsfile)
)
ssindel_outfile ="%s_indel"%sssnv_outfile
ffw=open(seattle_seq_indel_path,'w')
ffw.write('{0} --func annotate_ss_indel --config {1} --input {2}\
--output {3}\n'.format(
sys.argv[0],
configfile,
sssnv_outfile,
ssindel_outfile,
))
ffw.close()
if skip_seattleseq_flag == True:
os.system('cp {0} {1}'.format(sssnv_outfile,ssindel_outfile))
depend+=1
if depend>1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-W depend=afterany:$jobid{2} -l nodes=1{3} -N ssind{4}\
-p 100 -b 2 `\n'.format(depend,
seattle_seq_indel_path,
depend-1,
nodetypeswarm,
project_code
))
elif depend<=1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-l nodes=1{2} -N ssind{3} -p 100 -b 2 `\n'.format(depend,
seattle_seq_indel_path,
nodetypeswarm,
project_code
))
fm.write('echo $jobid{0}\n'.format(depend))
###############################################################################
annotate_with_ESP_path='%s/9_%s_annotate_with_ESP.sh'%(tmpdir,
os.path.basename(inputvsfile)
)
annotate_with_ESP_outfile ="%s_ESP"%ssindel_outfile
ffw = open(annotate_with_ESP_path,'w')
ffw.write('{0} --func annotate_with_ESP --config {1} --input {2}\
--output {3}\n'.format(
sys.argv[0],
configfile,
ssindel_outfile,
annotate_with_ESP_outfile,
))
ffw.close()
depend+=1
if depend>1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-W depend=afterany:$jobid{2} -l nodes=1{3} -N ssind{4}\
-p 100 -b 2 `\n'.format(depend,
annotate_with_ESP_path,
depend-1,
nodetypeswarm,
project_code
))
elif depend<=1:
fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\
-l nodes=1{2} -N ssind{3} -p 100 -b 2 `\n'.format(depend,
annotate_with_ESP_path,
nodetypeswarm,
project_code
))
fm.write('echo $jobid{0}\n'.format(depend))
#####################################################################################
scandirscdpred_path='%s/10_%s.scandirs_cdpred'%(tmpdir,os.path.basename(inputvsfile))
fscdr = open(scandirscdpred_path,'w')
cdpredoutfile='%s.cdpredneg'%annotate_with_ESP_outfile
fscdr.write('{0} --func scandirs_cdpred --config {1} --input {2} --output {3} \n'.format(
sys.argv[0],
configfile,
annotate_with_ESP_outfile,
cdpredoutfile,
))
fscdr.close()
#-------------------------------------------------------------------------------
#filter for only cdpred negative(predicted deleterious) variants
depend+=1
if depend>1:
fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\
-N cdn10{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
scandirscdpred_path
))
fm.write('echo $jobid{0}\n'.format(depend))
elif depend<=1:
fm.write('jobid{0}=`qsub -l nodes=1{1} -N cdn10{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
scandirscdpred_path
))
fm.write('echo $jobid{0}\n'.format(depend))
###################################################################################
#filters for hom_rec, dom and x-linked rec
simple_mendelian_filters_path = '%s/11_%s.mendelian_filters'%(tmpdir,os.path.basename(inputvsfile))
fmhr = open(simple_mendelian_filters_path,'w')
output_path = '{0}.mendelian'.format(cdpredoutfile)
include_bed_file = args.include_bed_file
linkage_bed_file = args.linkage_bed_file
cnv_bed_file = args.cnv_bed_file
if args.nextgene_data:
nextgene_data_flag = "--nextgene_data"
else:
nextgene_data_flag = ""
fmhr.write('{0} --func simple_mendelian_filters --config {1} --input {2}\
--include_bed_file {3} --linkage_bed_file {4} --cnv_bed_file {5} --output {6} {7} {8}\n'.format(
sys.argv[0],
configfile,
cdpredoutfile,
include_bed_file,
linkage_bed_file,
cnv_bed_file,
output_path,
project_code,
nextgene_data_flag,
))
fmhr.close()
depend+=1
if depend>1:
fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\
-N hm11{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
simple_mendelian_filters_path
))
fm.write('echo $jobid{0}\n'.format(depend))
elif depend<=1:
fm.write('jobid{0}=`qsub -l nodes=1{1} -N hm11{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
simple_mendelian_filters_path
))
fm.write('echo $jobid{0}\n'.format(depend))
#-------------------------------------------------------------------------------
#filters for hom_rec, dom and x-linked rec and compound heterezygous
comhet_mendelian_filter_path='%s/12_%s.comhet_mendelian_filter'%(tmpdir,os.path.basename(inputvsfile))
fmhr = open(comhet_mendelian_filter_path,'w')
comhet_output = '{0}.com_het'.format(cdpredoutfile)
include_bed_file = args.include_bed_file
linkage_bed_file = args.linkage_bed_file
cnv_bed_file = args.cnv_bed_file
if args.nextgene_data:
nextgene_data_flag="--nextgene_data"
else:
nextgene_data_flag=""
fmhr.write('{0} --func mendelian_filter_com_het_res --config {1} --input {2} --output {3} {4} {5}\n'.format(
sys.argv[0],
configfile,
cdpredoutfile,
comhet_output,
project_code,
nextgene_data_flag,
))
fmhr.close()
depend+=1
if depend>1:
fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\
-N hm11{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
comhet_mendelian_filter_path
))
fm.write('echo $jobid{0}\n'.format(depend))
elif depend<=1:
fm.write('jobid{0}=`qsub -l nodes=1{1} -N hm11{3} {4}`\n'.format(depend,
nodetypeqsub,
depend-1,
project_code,
comhet_mendelian_filter_path
))
fm.write('echo $jobid{0}\n'.format(depend))
#-------------------------------------------------------------------------------
fm.close()
subprocess.call(['chmod','754',masterpath])
#subprocess.call(['/bin/bash',masterpath])
#-------------------------------------------------------------------------------
#sequencial run script
steps = [x for x in range(12)]
if skip_hg19_add_flag == 1:
steps.remove(0)
steps.remove(1)
steps.remove(2)
if skip_seattleseq_flag:
steps.remove(3)
steps.remove(6)
steps.remove(7)
if dbsnpheader_in_file_flag:
steps.remove(4)
if allfams_in_file_flag:
steps.remove(5)
masterpath_seq='%s/%s_master_seq.sh'%(tmpdir,os.path.basename(inputvsfile)) #sequencial run
fm_seq=open(masterpath_seq,'w')
fm_seq.write('#!/usr/bin/env bash\n')
if 0 in steps:
fm_seq.write('{0}\n'.format(prep_liftover_path))
subprocess.call(['chmod','754',prep_liftover_path])
if 1 in steps:
fm_seq.write('{0}\n'.format(liftoverpath))
subprocess.call(['chmod','754',liftoverpath])
if 2 in steps:
fm_seq.write('{0}\n'.format(add_hg19_path))
subprocess.call(['chmod','754',add_hg19_path])
if 3 in steps:
fm_seq.write('{0}\n'.format(prep_ss_path))
subprocess.call(['chmod','754',prep_ss_path])
if 4 in steps:
fm_seq.write('{0}\n'.format(annotate_dbsnp_path))
subprocess.call(['chmod','754',annotate_dbsnp_path])
if 5 in steps:
fm_seq.write('{0}\n'.format(annotate_allfams_path))
subprocess.call(['chmod','754',annotate_allfams_path])
if 6 in steps:
fm_seq.write('{0}\n'.format(seattle_seq_snv_path))
subprocess.call(['chmod','754',seattle_seq_snv_path])
if 7 in steps:
fm_seq.write('{0}\n'.format(seattle_seq_indel_path))
subprocess.call(['chmod','754',seattle_seq_indel_path])
if 8 in steps:
fm_seq.write('{0}\n'.format(annotate_with_ESP_path))
subprocess.call(['chmod','754',annotate_with_ESP_path])
if 9 in steps:
fm_seq.write('{0}\n'.format(scandirscdpred_path))
subprocess.call(['chmod','754',scandirscdpred_path])
if 10 in steps:
fm_seq.write('{0}\n'.format(simple_mendelian_filters_path))
subprocess.call(['chmod','754',simple_mendelian_filters_path])
if 11 in steps:
fm_seq.write('{0}\n'.format(comhet_mendelian_filter_path))
subprocess.call(['chmod','754',comhet_mendelian_filter_path])
fm_seq.close()
subprocess.call(['chmod','754',masterpath_seq])
print 'varmd_input_file_path:{0}'.format(inputvsfile)
print 'varmd_distributed_qsub_script_path:{0}'.format(masterpath)
print 'varmd_sequential_script_path:{0}'.format(masterpath_seq)
#subprocess.call(['/bin/bash',masterpath_seq])
def prep_seattle_seq_input(hg19file, outpath_snv, outpath_indel, configfile):
"""
creates the input files that need to be uploaded to Seatlle Seq web site for annotation
If in the first pass a seattle seq input file is generated, run those files through seattleseq web site and
put the seattleseq output files in the same directory with the main annotation file and run this function again
it will look for the incremental annotation file and merge it with the main file
"""
sys.stderr.write('NOTICE: varmd(prepare_seattleseq_input) started for %s\n'%hg19file)
import gzip
myfile = open(hg19file)
hline = myfile.readline()
htl = hline.strip().split('\t')
snpcounter=0
indelcounter=0
outfile_snv = open(outpath_snv,'w')
outfile_indel = open(outpath_indel,'w')
snvdict={}
indeldict={}
filesdict={}
filessnvcount=0
filesindelcount=0
filesnvsize1=0
fileindelsize1=0
bigfilesnvsize=0
bigfileindelsize=0
totalsnvlines=[]
totalindellines=[]
headersnv1=[]
headersnv2=[]
headerindel1=[]
headerindel2=[]
for file_ in glob.glob(os.path.join(mycfg.seattleseq_dir_path,'*')):
if ('SeattleSeqAnnotation' in file_) and ('ss_snv' in file_):
sys.stderr.write('found snv file %s\n'%file_)
filessnvcount+=1
filesdict[file_]={}
filesdict[file_]['size']=os.path.getsize(file_)
sys.stderr.write('Reading %s into memory\n'%file_)
ss_snv_lines = gzip.open(file_,'rb').readlines()
filesdict[file_]['type']='snv'
if filessnvcount==1:
filesnvsize1=os.path.getsize(file_)
totalsnvlines.extend(ss_snv_lines)
headersnv1=ss_snv_lines[0]
elif filessnvcount==2:
sys.stderr.write('Two SeattleSeqAnnoattion files were found, consolidating ...\n')
bigfilesnvsize = max(os.path.getsize(file_),filesnvsize1)
totalsnvlines.extend(ss_snv_lines[1:])
headersnv2=ss_snv_lines[0]
if not headersnv1==headersnv2:
print 'The two snv annotation files do not have the same headers,please use files created by the same SeattleSeq version'
htl1 = headersnv1.strip().split('\t')
htl2 = headersnv2.strip().split('\t')
print 'file1 has %d headers, file2 has %d headers'%(len(htl1), len(htl2))
raise Exception
elif filessnvcount>2:
raise Exception,'more than 2 SeattleSeq annotation files were found, please use only two files for incremental annotation'
snvhtl = ss_snv_lines[0].strip().split('\t')
for snvline in ss_snv_lines:
if not snvline.startswith('#'):
snvtl = snvline.strip().split('\t')
snvdict[(snvtl[snvhtl.index('chromosome')],snvtl[snvhtl.index('position')])] = snvtl
elif (('SeattleSeqAnnotation' in file_) and ('ss_indel' in file_)):
sys.stderr.write('found indel file %s\n'%file_)
filesindelcount+=1
filesdict[file_]={}
filesdict[file_]['size']=os.path.getsize(file_)
filesdict[file_]['type']='indel'
ss_indel_lines=gzip.open(file_,'rb').readlines()
if filesindelcount==1:
fileindelsize1=os.path.getsize(file_)
totalindellines.extend(ss_indel_lines)
headerindel1=ss_indel_lines[0]
elif filesindelcount==2:
bigfileindelsize = max(os.path.getsize(file_),fileindelsize1)
totalindellines.extend(ss_indel_lines[1:])
headerindel2=ss_indel_lines[0]
assert headerindel1==headerindel2,'The two indel annotation files do not have the same headers,please use files created by the same SeattleSeq version'
elif filesindelcount>2:
raise Exception,'more than 2 SeattleSeq annotation files were found, please use only two files for incremental annotation'
indelhtl = ss_indel_lines[0].strip().split('\t')
for indelline in ss_indel_lines:
if not indelline.startswith('#'):
indeltl = indelline.strip().split('\t')
indeldict[(indeltl[indelhtl.index('chromosome')],
indeltl[indelhtl.index('position')],
indeltl[indelhtl.index('sampleAlleles')])] = indeltl
for k,v in filesdict.iteritems():
if v['size'] == bigfilesnvsize and v['type']=='snv':
fb = gzip.open(k,'wb')
fb.writelines(totalsnvlines)
fb.close()
elif v['size'] == bigfileindelsize and v['type']=='indel':
fb = gzip.open(k,'wb')
fb.writelines(totalindellines)
fb.close()
for k,v in filesdict.iteritems():
if v['size']!=bigfilesnvsize and v['type']=='snv' and headersnv2:#delete the smaller file only if the second file header exists
os.system('rm %s'%k)
elif v['size']!=bigfileindelsize and v['type']=='indel' and headerindel2: #delete the smaller file only if the second file header exists
os.system('rm %s'%k)
sys.stderr.write('consolidation of seattleseq annotation files is complete, starting to read input file\n')
#for tl in [x.strip().split('\t') for x in mylines[1:]]:
linecounter=0
inthemasterfilecounter=0
for line in myfile:
linecounter+=1
tl = line.strip().split('\t')
inserted=None
deleted=None
# if tl[htl.index('hg19LeftFlank')]=='207881485':
# print tl[htl.index('ref_allele')]
# print len(tl[htl.index('ref_allele')])
# print "'" in tl[htl.index('ref_allele')]
# print tl[htl.index('var_allele')]
# print len(tl[htl.index('var_allele')])
# print type(tl[htl.index('var_allele')])
# raise Exception
if tl[htl.index('hg19LeftFlank')]!='-':
flag=0
if (len(tl[htl.index('var_allele')]) == 1 and
tl[htl.index('var_allele')] != '-' and
tl[htl.index('var_allele')] != '*' and
tl[htl.index('ref_allele')] in ['A','T', 'C','G']):
if (tl[htl.index('Chr')].replace('chr',''),str(int(tl[htl.index('hg19LeftFlank')])+1)) in snvdict:
inthemasterfilecounter+=1
#print 'I found the position in the ss_snv file, passing'
#time.sleep(1)
else:
#print (tl[htl.index('Chr')].replace('chr',''),str(int(tl[htl.index('hg19LeftFlank')])+1))
#time.sleep(1)
outfile_snv.write('\t'.join( [tl[htl.index('Chr')],
str(int(tl[htl.index('hg19LeftFlank')])+1),
tl[htl.index('var_allele')]] )+'\n' )
snpcounter+=1
flag=1
# the location or position in the seattle seq file if the left flank position of the indel
#in other words the positions one base before the indel
elif "'" in tl[htl.index('ref_allele')]:
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%tl[htl.index('var_allele')]) in indeldict:
#print 'I found this indel in the dictioanry'
pass
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%tl[htl.index('var_allele')]] )+'\n' )
indelcounter+=1
flag=2
elif (tl[htl.index('var_allele')]!="''" and
len(tl[htl.index('ref_allele')]) < len(tl[htl.index('var_allele')])):
inserted = tl[htl.index('var_allele')].replace(tl[htl.index('ref_allele')],'',1)
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%inserted) in indeldict:
#print 'I found this indel in the dictioanry'
pass
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%inserted])+'\n')
indelcounter+=1
flag=3
elif "'" in tl[htl.index('var_allele')]:
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'-%s:0/0'%tl[htl.index('ref_allele')]) in indeldict:
#print 'I found this indel in the dictioanry'
pass
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
str(int(tl[htl.index('hg19LeftFlank')]) + len(tl[htl.index('ref_allele')])),
'-%s:0/0'%tl[htl.index('ref_allele')]] )+'\n' )
indelcounter+=1
flag=4
elif (tl[htl.index('var_allele')]!="''" and
len(tl[htl.index('var_allele')]) < len(tl[htl.index('ref_allele')])):
deleted=tl[htl.index('ref_allele')].replace(tl[htl.index('var_allele')],'',1)
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'-%s:0/0'%deleted) in indeldict:
#print 'I found this indel in the dictioanry'
inthemasterfilecounter+=1
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
str(int(tl[htl.index('hg19LeftFlank')])+len(tl[htl.index('var_allele')])),
'-%s:0/0'%deleted])+'\n')
indelcounter+=1
flag=5
#if tl[htl.index('hg19LeftFlank')]=='207881485':
# print flag
outfile_snv.close()
outfile_indel.close()
sys.stderr.write ('total number of lines processed = %s \n'%linecounter)
sys.stderr.write('number of exported snps %s \n'%snpcounter)
sys.stderr.write('number of exported indels %s \n'%indelcounter)
sys.stderr.write('number of variants not exported is %s \n'%(linecounter - (snpcounter+indelcounter+1)))
sys.stderr.write('%s of these not exported were already in the masterfile \n'%(inthemasterfilecounter))
def check_varsifter_file_format(input_file):
flag = True
myfile =open(input_file)
hline = myfile.readline()
htl = hline.strip().split('\t')
assert '\t' in hline,('The file is not tab delimited, please see varmd file specifications')
myvarfile = varfile(htl)
patient_pos = myvarfile.first_sample_index
assert isinstance(patient_pos,int), 'Returned patient position is not a number\
check your cohort(allfams) file'
not_found_list=[]
csnum=801 #number of Clinseq samples used in diferent varsiftr headers
varsifter_format = ['Index','Chr','LeftFlank','RightFlank','ref_allele','muttype','var_allele','type',
'Gene_name','loc_type','consequence',
'CS_%sc_genotypes'%csnum,
'CS_%sc_homref'%csnum,
'CS_%sc_het'%csnum,
'CS_%sc_homvar'%csnum,
'CS_%sc_hetnonref'%csnum,
'CS_%sc_other'%csnum,
'CS_%sc_refallele'%csnum,
'CS_%sc_varallele'%csnum,
'CS_%sfreq_homref'%csnum,
'CS_%sfreq_het'%csnum,
'CS_%sfreq_homvar'%csnum,
'CS_%sfreq_hetnonref'%csnum,
'CS_%sfreq_refallele'%csnum,
'CS_%sfreq_varallele'%csnum,
'CS_%sref_is_minor'%csnum,
'CS_%sc_major'%csnum,
'CS_%sc_minor'%csnum,
'CS_%smaf'%csnum,
'CS_%schisquare'%csnum,
'transcript', 'ref_aa', 'var_aa', 'aa_pos', 'CDPred_score', 'dbID', 'HGMDids', 'HGMDdisease', 'HGMDtags',
'HGMD_HGVS', 'REF_MATCHES_HGMD_DISEASE_ALLELE', 'HGMD_ALLELES_in_GENOME_STRAND', 'HGMDinGene', '1K_total_depth_at_site',
'1K_rms_mapping_quality', '1K_#samples_with_covg', '1K_#alleles_in_samples_with_covg', '1K_alt_allele_cnts_in_samples_with_covg',
'1K_alt_allele_freqs', '1K_allele_balance_in_hets']
for item in varsifter_format:
found = 0
for x in htl:
if item == x:
found = 1
break
if found == 0:
not_found_list.append(item)
for item in htl:
if item.startswith('UDP') and not (item.endswith('.NA') or item.endswith('.NA.score') or item.endswith('.NA.coverage')):
flag = False
print item
if not_found_list:
print 'Following items are not found in the input file, please make sure the file is in the right format'
for item in not_found_list:
print item
flag = False
return flag
def fileslist(filesdir,myext,myexcludeinfile,myexcludeinroot,matchbase=''):
"""
given a directory to look into "filesdir", an extention to match
it returns a list with
"""
verbose=0
if verbose:
print 'filesdir',filesdir
print 'myext',myext
print 'myexcludeinfile',myexcludeinfile
print 'myexcludeinroot',myexcludeinroot
files1=[]
#print os.path.join(filesdir,'*/*.bam')
#for file in glob.glob('~/../*.bam'):
#print file
#time.sleep(1)
for root,dir,files in os.walk(filesdir):
if files:
for file in files:
#if os.path.splitext(file)[1]=='%s'%myext and '%s'%myexcludeinfile not in file and '%s'%myexcludeinroot not in root:
if os.path.splitext(file)[1] == myext and myexcludeinroot not in root:
exclude_flag =0
for x in myexcludeinfile:
if x in file:
exclude_flag=1
if not exclude_flag:
if matchbase:
if re.search(r'%s[0-9]*'%matchbase,os.path.splitext(file)[0]):
#print '%s/%s'%(root,file)
files1.append('%s/%s'%(root,file))
else:
pass
#print 'regular expression pattern not matched'
#print 'root 2',root,'file 2',file
#time.sleep(0.2)
else:
if verbose:print '*** root:',root,'*** file:',file
print '%s/%s'%(root,file)
files1.append('%s/%s'%(root,file))
#print 'root',root;print '\ndir',dir;print '\nfiles',files
#time.sleep(1)
#print 'from function fileslist files1 is ',files1, len(files1)
#raise Exception
return files1
def varmd2vcf():
"""
creates a vcf file from the varsifter/varmd interval format
"""
format = check_varsifter_file_format()
if format == True:
hgref ={chr:{} for chr in chrs}
myfasta = open(args.reference).readlines()
counter=0
for line in myfasta[:10000]:
counter+=1
if line.startswith('>'):
basepos=0
chr = line.strip().replace('>','')
#print chr
else:
hgref[chr][(basepos+1,basepos+50)] = line.strip()
basepos+=50
#for k,v in hgref.iteritems():
# print k
# print v
if counter %100000 == 0:
print counter
print hgref['chr1'][(51,100)]
def get_bases(key):
left = int(key[1])
right= int(key[2])
cdleft = left / 50
cdright = right / 50
if cdleft == 0 and cdright == 0:
return hgref[key[0]][(key[1],key[2])]
elif cdleft == 0 and cdright > 0:
first = hgref[key[0]][(key[1],50)]
print 'first', first
times = ((right - left ) /50 ) + 1
for time_ in xrange(times):
print ((left/50)+1)*50 , (((left/50)+1)*50 )+50
first = hgref[key[0]][(key[1],50)]
else:
return False
a = get_bases(['chr1', 1,50])
print a
#for k,v in hgref.iteritems():
# for k1,v1 in v.iteritems():
# print v1[:300]
# time.sleep(2)
fw = open(args.output_file, 'w')
fw.write('##fileformat=VCFv4.0\n')
fw.write('##fileDate=%s\n'%time.ctime())
fw.write('##source=VAR-MD\n')
fw.write('##reference=hg19-liftOver\n')
fw.write('##INFO=\n')
fw.write('\t'.join(['CHROM','POS','ID','REF','ALT', 'QUAL', 'FILTER', 'INFO'])+'\n')
myfile =open(args.input_file)
hline = myfile.readline()
assert '\t' in hline,('The file is not tab delimited, please see varmd file specifications')
htl = hline.strip().split('\t')
counter=0
for line in myfile:
counter+=1
if counter >100:
break
tl = line.strip().split('\t')
CHR = tl[htl.index('Chr')].replace('Chr','')
if chr == "M":
chr == "MT"
ID = '.'
REF = tl[htl.index('ref_allele')]
QUAL = "."
FILTER = "."
INFO = '.'
if tl[htl.index('muttype')]=='SNP':
POS = str(int(tl[htl.index('LeftFlank')])+1)
ALT = tl[htl.index('var_allele')]
elif tl[htl.index('muttype')]=='INDEL':
pass
raise Exception
POS = str(int(tl[htl.index('LeftFlank')])+1)
ALT = ''
if "'" in tl[htl.index('ref_allele')]:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%tl[htl.index('var_allele')]] )+'\n' )
indelcounter+=1
flag=2
elif (tl[htl.index('var_allele')]!="''" and
len(tl[htl.index('ref_allele')]) < len(tl[htl.index('var_allele')])):
inserted=tl[htl.index('var_allele')].replace(tl[htl.index('ref_allele')],'',1)
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%inserted) in indeldict:
#print 'I found this indel in the dictioanry'
pass
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
tl[htl.index('hg19LeftFlank')],
'+%s:0/0'%inserted])+'\n')
indelcounter+=1
flag=3
elif "'" in tl[htl.index('var_allele')]:
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'-%s:0/0'%tl[htl.index('ref_allele')]) in indeldict:
#print 'I found this indel in the dictioanry'
pass
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
str(int(tl[htl.index('hg19LeftFlank')]) + len(tl[htl.index('ref_allele')])),
'-%s:0/0'%tl[htl.index('ref_allele')]] )+'\n' )
indelcounter+=1
flag=4
elif (tl[htl.index('var_allele')]!="''" and
len(tl[htl.index('var_allele')]) < len(tl[htl.index('ref_allele')])):
deleted=tl[htl.index('ref_allele')].replace(tl[htl.index('var_allele')],'',1)
if (tl[htl.index('Chr')].replace('chr',''),
tl[htl.index('hg19LeftFlank')],
'-%s:0/0'%deleted) in indeldict:
#print 'I found this indel in the dictioanry'
inthemasterfilecounter+=1
else:
outfile_indel.write('\t'.join( [tl[htl.index('Chr')],
tl[htl.index('hg19LeftFlank')],
str(int(tl[htl.index('hg19LeftFlank')])+len(tl[htl.index('var_allele')])),
'-%s:0/0'%deleted])+'\n')
indelcounter+=1
fw.write('\t'.join([CHR, POS, ID, REF, ALT, QUAL, FILTER, INFO])+'\n')
else:
raise Exception
def split_by_chrs():
myfile = open(args.input_file)
hline = myfile.readline()
htl = hline.strip().split('\t')
outfiles ={}
for chr in chrs:
outfiles[chr]= open('%s.%s'%(args.input_file, chr), 'w')
outfiles[chr].write(hline)
myfile = open(args.input_file)
myfile.readline()
for line in myfile:
tl= line.strip().split('\t')
outfiles[tl[htl.index('Chr')]].write(line)
def combine_by_chrs():
myfile = open(args.output_file)
outfiles ={}
for chr in chrs:
outfiles[chr]= open('%s.%s'%(args.input_file.replace('_dbsnp',''), chr))
hline = outfiles['chr1'].readline()
myfile.write(hline)
for line in myfile:
tl= line.strip().split('\t')
outfiles[tl[htl.index('Chr')]].write(line)
if __name__=="__main__":
#this token is to decide if total time it took for the program to run would be printed or not.
arguments()
mycfg = config(args.config_file)
engine = create_engine(mycfg.dbconnect, pool_recycle=600) #pool_recycle values are in seconds
metadata = MetaData(bind=engine)
if args.verbose:
print 'Function=',args.function_name
print 'Arguments=',args.args
if not os.path.exists(args.config_file):
print """Configuration file was not found at %s
Please make sure the configuration file is at the same directory as varpy
, the path is provided with --config and that it contains correct data"""%args.config_file
sys.exit()
if args.downdb != '':
downdb()
if args.function_name == 'usage' and len(args.args) == 0:
usage()
endtimeprinttoken=0
elif args.function_name=='split_by_chrs':
split_by_chrs()
elif args.function_name=='prepare_liftover_input':
prepare_liftover_input(args.input_file, args.output_file)
elif args.function_name=='add_hg19':
#add_hg19(args[0],args[1],args[2],args[3])
add_hg19(args.input_file, args.liftover, args.output_file)
elif args.function_name == 'prep_ss':
prep_seattle_seq_input(args.input_file, args.output_snv, args.output_indel, args.config_file)
elif args.function_name =='annotate_ss_snv':
annotate_seattle_seq_snv(args.input_file, args.output_file)
elif args.function_name =='annotate_ss_indel':
annotate_seattle_seq_indel(args.input_file, args.output_file)
elif args.function_name=='scandirs_cdpred':
scandirs_cdpred(args.input_file, args.output_file)
elif args.function_name=='annotate_with_dbsnp':
annotate_with_dbsnp(args.input_file, args.output_file)
elif args.function_name=='annotate_with_ESP':
annotate_with_ESP(args.input_file, args.output_file)
elif args.function_name=='annotate_with_allfams':
annotate_with_allfams(args.input_file,args.output_file)
elif args.function_name=='simple_mendelian_filters':
simple_mendelian_filters(args.input_file, args.args[0],
include_bed_file = args.include_bed_file,
linkage_bed_file = args.linkage_bed_file,
cnv_bed_file = args.cnv_bed_file)
elif args.function_name=='annotate_with_mendelian_filters':
annotate_with_mendelian_filters(args.input_file, args.output_file, args.args[0],
include_bed_file = args.include_bed_file,
linkage_bed_file = args.linkage_bed_file,
cnv_bed_file = args.cnv_bed_file)
elif args.function_name=='mendelian_filter_com_het_res':
mendelian_filter_com_het_res(args.input_file, args.output_file,args.args[0],
include_bed_file = args.include_bed_file,
linkage_bed_file = args.linkage_bed_file,
cnv_bed_file = args.cnv_bed_file)
#cProfile.run('split_allfams_byfamily()')
elif args.function_name=='create_allsamples_db':
create_allsamples_db()
elif args.function_name=='create_allsamples_db_mysql':
create_allsamples_db_mysql()
elif args.function_name=='update_allfams':
update_allfams(args[0])
elif args.function_name=='update_local_freq_file':
update_local_freq_file(args[0])
elif args.function_name=='dbsnp_vcf2db':
dbsnp_vcf2db()
elif args.function_name=='ESP_vcf2db':
ESP_vcf2db()
elif args.function_name=='varmd2vcf':
varmd2vcf()
elif args.function_name=='annotate_with_interval_data':
annotate_with_interval_data()
elif args.function_name=='split_allfams_byfamily':
split_allfams_byfamily()
elif args.function_name=='split_allfams_bysample':
split_allfams_bysample()
elif args.function_name=='run_analysis':
#args 0 path to file
#prep_file(args[0],args[1],args[2])
run_analysis(args.config_file)
elif args.function_name=='run_analysis_single_file':
run_analysis_single_file(args.config_file)
elif args.function_name=='fix_nextgene':
if args:
fix_nextgene(args[0])
else:
print 'varmd.py --func fix_nextgene '
else:
print 'The function names and arguments you provided were not recognised as legitimate function calls and parameters'
usage()
endall=time.time()
if args.verbose:
sys.stdout.write("\nIt took me %.2f minutes to complete the entire operation \n" %((endall-startall)/60))