#!/usr/bin/env python ####################################################################### ####################################################################### # VAR-MD : Variant Analysis in Rare and Mendelian Diseases # Developed by Murat Sincan (C) 2012 # msincan@gmail.com # # homepage: http://code.google.com/p/var-md/ # This program is described in: # Sincan Et al. 2012 Human Mutation doi: 10.1002/humu.2203 # # This file is part of VAR-MD # VAR-MD is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see . ####################################################################### ####################################################################### import argparse from copy import deepcopy import cProfile from fractions import Fraction import glob import os import string import shutil import shlex import subprocess import sys import smtplib import time import re import urllib import urllib2 from HTMLParser import HTMLParser import sqlite3 from sqlalchemy import create_engine from sqlalchemy import func from sqlalchemy import Table, Column, Integer, Float, String, MetaData, ForeignKey, Index, and_, or_, not_ from sqlalchemy.orm import mapper if not (sys.version_info.major==2 and sys.version_info.minor==7): print 'VAR-MD requires python 2.7, current python version is %s.%s'%(sys.version_info.major,sys.version_info.minor) sys.exit(1) startdate=time.ctime() startall=time.time() #------------------------------------------------------------------------------- chrs=["chr"+str(x) for x in range(1,23)] chrs.extend(["chrX","chrY","chrM"]) chrs_random=["chr"+str(x) for x in range(1,23)] chrs_random.extend(["chrX","chrY","chrM","chrX_random","chrY_random","chrM_random"]) chrs_random.extend(["chr%s_random"%str(x) for x in range(1,23)]) def arguments(): global options global args parser = argparse.ArgumentParser(description = 'VAR-MD: Clinical and research analysis of \ next-gen sequencing data for Mendelian diseases in pedigrees') parser.add_argument("--func", dest="function_name", default="usage", help="Function name to be run") parser.add_argument("--cleanup", dest="cleanup", default = 'no', help="--cleanup yes => program will cleanup intermideate/temporary files, default is no") parser.add_argument("--database", dest="database", help="""define path\ to databases as --database [path_to_db]""") parser.add_argument('--downdb', dest = 'downdb', default ='', help = 'downloads UCSC annotation tables') parser.add_argument('--buildver',dest = 'buildver', default = 'hg19', help = 'specify a valid genome build (hg18, hg19)') parser.add_argument('--dbdir',dest = 'dbdir', default = 'dbdir', help = 'database directory where the public annoatation tables reside') parser.add_argument('--input', dest = 'input_file', default = '', help = 'path for the input file to be processed') parser.add_argument('--config', dest = 'config_file', default = os.path.join(os.path.dirname(sys.argv[0]),'varmd.cfg'), help = 'absolute path for the configuration file') parser.add_argument('--projects_metadata', dest = 'projects_metadata_file', default = '', help = 'absolute path for the projects metadata file') parser.add_argument('--output', dest = 'output_file', default = '', help = 'path for the output file to be generated') parser.add_argument('--output_snv', dest = 'output_snv', default = '', help = 'path for the output file for SeattleSeq SNV, must have .ss_snv extention') parser.add_argument('--output_indel', dest = 'output_indel', default = '', help = 'path for the output file for SeattleSeq indels, must have .ss_indel extention') parser.add_argument('--liftover', dest = 'liftover', default = '', help = 'path for the output bed file that contains the coordinates\ generated by liftover program') parser.add_argument('--outdir', dest = 'output_dir', default = '', help = 'path for the output_dir') parser.add_argument('--include_bed_file', dest = 'include_bed_file', default = 'None', help = 'path for the include_bed_file') parser.add_argument('--linkage_bed_file', dest = 'linkage_bed_file', default = 'None', help = 'path for the linkage_bed_file') parser.add_argument('--cnv_bed_file', dest = 'cnv_bed_file', default = 'None', help = 'path for the cnv_bed_file') parser.add_argument('--annotation_file', dest = 'annotation_file', default = 'None', help = 'path for the intervals based annotations file') parser.add_argument('--reference', dest = 'reference', default = '', help = 'path for the reference genome fasta file') parser.add_argument('--version', action = 'version', version = '0.1', help = 'Prints version') parser.add_argument('--project_code', dest = 'project_code', default = '', help = 'project code to be used in the analysis') parser.add_argument('--allow_missing', action = 'store_true', help = 'allow missing samples from data') parser.add_argument('--extract_subject_only', action = 'store_true', help = 'extract only the subject (do not include data for other subjects in the output) when used with split_allfams_bysample') parser.add_argument('--nextgene_data', action = 'store_true', help = 'data analyzed is generated by next gene: applies special criteria to accomodate NextGene variant reports') parser.add_argument('--verbose', action = 'store_true', help = 'sets verbosity') parser.add_argument('args', metavar = 'args', nargs = '*', help = 'all unnamed positional arguments') args = parser.parse_args() #print args #print args.function_name #time.sleep(10000) return args """ codon1 = {TTT:"F", TTC:"F", TCT:"S", TCC:"S", TAT:"Y", TAC:"Y", TGT:"C", TGC:"C", TTA:"L", TCA:"S", TAA:"*", TGA:"*", TTG:"L", TCG:"S", TAG:"*", TGG:"W", CTT:"L", CTC:"L", CCT:"P", CCC:"P", CAT:"H", CAC:"H", CGT:"R", CGC:"R", CTA:"L", CTG:"L", CCA:"P", CCG:"P", CAA:"Q", CAG:"Q", CGA:"R", CGG:"R", ATT:"I", ATC:"I", ACT:"T", ACC:"T", AAT:"N", AAC:"N", AGT:"S", AGC:"S", ATA:"I", ACA:"T", AAA:"K", AGA:"R", ATG:"M", ACG:"T", AAG:"K", AGG:"R", GTT:"V", GTC:"V", GCT:"A", GCC:"A", GAT:"D", GAC:"D", GGT:"G", GGC:"G", GTA:"V", GTG:"V", GCA:"A", GCG:"A", GAA:"E", GAG:"E", GGA:"G", GGG:"G"} codon3 = {TTT:"Phe", TTC:"Phe", TCT:"Ser", TCC:"Ser", TAT:"Tyr", TAC:"Tyr", TGT:"Cys", TGC:"Cys", TTA:"Leu", TCA:"Ser", TAA:"*", TGA:"*", TTG:"Leu", TCG:"Ser", TAG:"*", TGG:"Trp", CTT:"Leu", CTC:"Leu", CCT:"Pro", CCC:"Pro", CAT:"His", CAC:"His", CGT:"Arg", CGC:"Arg", CTA:"Leu", CTG:"Leu", CCA:"Pro", CCG:"Pro", CAA:"Gln", CAG:"Gln", CGA:"Arg", CGG:"Arg", ATT:"Ile", ATC:"Ile", ACT:"Thr", ACC:"Thr", AAT:"Asn", AAC:"Asn", AGT:"Ser", AGC:"Ser", ATA:"Ile", ACA:"Thr", AAA:"Lys", AGA:"Arg", ATG:"Met", ACG:"Thr", AAG:"Lys", AGG:"Arg", GTT:"Val", GTC:"Val", GCT:"Ala", GCC:"Ala", GAT:"Asp", GAC:"Asp", GGT:"Gly", GGC:"Gly", GTA:"Val", GTG:"Val", GCA:"Ala", GCG:"Ala", GAA:"Glu", GAG:"Glu", GGA:"Gly", GGG:"Gly"} codonfull = {TTT:"Phenylalanine", TTC:"Phenylalanine", TCT:"Serine", TCC:"Serine", TAT:"Tyrosine", TAC:"Tyrosine", TGT:"Cysteine", TGC:"Cysteine", TTA:"Leucine", TCA:"Serine", TAA:"Stop", TGA:"Stop", TTG:"Leucine", TCG:"Serine", TAG:"Stop", TGG:"Tryptophan", CTT:"Leucine", CTC:"Leucine", CCT:"Proline", CCC:"Proline", CAT:"Histidine", CAC:"Histidine", CGT:"Arginine", CGC:"Arginine", CTA:"Leucine", CTG:"Leucine", CCA:"Proline", CCG:"Proline", CAA:"Glutamine", CAG:"Glutamine", CGA:"Arginine", CGG:"Arginine", ATT:"Isoleucine", ATC:"Isoleucine", ACT:"Threonine", ACC:"Threonine", AAT:"Asparagine", AAC:"Asparagine", AGT:"Serine", AGC:"Serine", ATA:"Isoleucine", ACA:"Threonine", AAA:"Lysine", AGA:"Arginine", ATG:"Methionine", ACG:"Threonine", AAG:"Lysine", AGG:"Arginine", GTT:"Valine", GTC:"Valine", GCT:"Alanine", GCC:"Alanine", GAT:"Aspartic acid", GAC:"Aspartic acid", GGT:"Glycine", GGC:"Glycine", GTA:"Valine", GTG:"Valine", GCA:"Alanine", GCG:"Alanine", GAA:"Glutamic acid", GAG:"Glutamic acid", GGA:"Glycine", GGG:"Glycine"} codonr1 = {UUU:"F", UUC:"F", UCU:"S", UCC:"S", UAU:"Y", UAC:"Y", UGU:"C", UGC:"C", UUA:"L", UCA:"S", UAA:"*", UGA:"*", UUG:"L", UCG:"S", UAG:"*", UGG:"W", CUU:"L", CUC:"L", CCU:"P", CCC:"P", CAU:"H", CAC:"H", CGU:"R", CGC:"R", CUA:"L", CUG:"L", CCA:"P", CCG:"P", CAA:"Q", CAG:"Q", CGA:"R", CGG:"R", AUU:"I", AUC:"I", ACU:"T", ACC:"T", AAU:"N", AAC:"N", AGU:"S", AGC:"S", AUA:"I", ACA:"T", AAA:"K", AGA:"R", AUG:"M", ACG:"T", AAG:"K", AGG:"R", GUU:"V", GUC:"V", GCU:"A", GCC:"A", GAU:"D", GAC:"D", GGU:"G", GGC:"G", GUA:"V", GUG:"V", GCA:"A", GCG:"A", GAA:"E", GAG:"E", GGA:"G", GGG:"G"} codonr3 = {UUU:"Phe", UUC:"Phe", UCU:"Ser", UCC:"Ser", UAU:"Tyr", UAC:"Tyr", UGU:"Cys", UGC:"Cys", UUA:"Leu", UCA:"Ser", UAA:"*", UGA:"*", UUG:"Leu", UCG:"Ser", UAG:"*", UGG:"Trp", CUU:"Leu", CUC:"Leu", CCU:"Pro", CCC:"Pro", CAU:"His", CAC:"His", CGU:"Arg", CGC:"Arg", CUA:"Leu", CUG:"Leu", CCA:"Pro", CCG:"Pro", CAA:"Gln", CAG:"Gln", CGA:"Arg", CGG:"Arg", AUU:"Ile", AUC:"Ile", ACU:"Thr", ACC:"Thr", AAU:"Asn", AAC:"Asn", AGU:"Ser", AGC:"Ser", AUA:"Ile", ACA:"Thr", AAA:"Lys", AGA:"Arg", AUG:"Met", ACG:"Thr", AAG:"Lys", AGG:"Arg", GUU:"Val", GUC:"Val", GCU:"Ala", GCC:"Ala", GAU:"Asp", GAC:"Asp", GGU:"Gly", GGC:"Gly", GUA:"Val", GUG:"Val", GCA:"Ala", GCG:"Ala", GAA:"Glu", GAG:"Glu", GGA:"Gly", GGG:"Gly"} codonrfull = {UUU:"Phenylalanine", UUC:"Phenylalanine", UCU:"Serine", UCC:"Serine", UAU:"Tyrosine", UAC:"Tyrosine", UGU:"Cysteine", UGC:"Cysteine", UUA:"Leucine", UCA:"Serine", UAA:"Stop", UGA:"Stop", UUG:"Leucine", UCG:"Serine", UAG:"Stop", UGG:"Tryptophan", CUU:"Leucine", CUC:"Leucine", CCU:"Proline", CCC:"Proline", CAU:"Histidine", CAC:"Histidine", CGU:"Arginine", CGC:"Arginine", CUA:"Leucine", CUG:"Leucine", CCA:"Proline", CCG:"Proline", CAA:"Glutamine", CAG:"Glutamine", CGA:"Arginine", CGG:"Arginine", AUU:"Isoleucine", AUC:"Isoleucine", ACU:"Threonine", ACC:"Threonine", AAU:"Asparagine", AAC:"Asparagine", AGU:"Serine", AGC:"Serine", AUA:"Isoleucine", ACA:"Threonine", AAA:"Lysine", AGA:"Arginine", AUG:"Methionine", ACG:"Threonine", AAG:"Lysine", AGG:"Arginine", GUU:"Valine", GUC:"Valine", GCU:"Alanine", GCC:"Alanine", GAU:"Aspartic acid", GAC:"Aspartic acid", GGU:"Glycine", GGC:"Glycine", GUA:"Valine", GUG:"Valine", GCA:"Alanine", GCG:"Alanine", GAA:"Glutamic acid", GAG:"Glutamic acid", GGA:"Glycine", GGG:"Glycine"} iupac = {R:'AG', Y:'CT', S:'GC', W:'AT', K:'GT', M:'AC', A:'AA', C:'CC', G:'GG', T:'TT', B:'CGT', D:'AGT', H:'ACT', V:'ACG', N:'ACGT', '.':'-', '-':'-'} threeletter = {"ala":["alanine","A"] ,"arg":["arginine","R"], "asn": ["asparagine","N"], "asp":["aspartic acid","D"] , asparagine or aspartic acid asx B cysteine cys C glutamic acid glu E glutamine gln Q glutamine or glutamic acid glx Z glycine gly G histidine his H isoleucine ile I leucine leu L lysine lys K methionine met M phenylalanine phe F proline pro P serine ser S threonine thr T tryptophan trp W tyrosine tyr Y valine val V """ class config(object): debug=0 def __init__(self,configfile): assert os.path.exists(configfile),"""ERROR:Configuration file not found, varmd.cfg must exist in the same directory with the varmd.py""" self.configfile = open(configfile) self.inputfiles={} self.input_bam_files={} self.input_include_bed_files={} self.input_cnv_bed_files={} self.input_include_linkage_bed_files={} for self.item in self.configfile: if not self.item.strip(): continue else: self.tl=self.item.strip().split() if self.tl[0][0] != '#': if self.tl[0] == 'input_file': if not self.inputfiles.get(self.tl[1]): self.inputfiles[self.tl[1]] = {} self.inputfiles[self.tl[1]] = self.tl[2] else: self.inputfiles[self.tl[1]] = self.tl[2] elif self.tl[0]=='input_bam_file': if not self.input_bam_files.get(self.tl[1]): #self.input_bam_files[self.tl[1]]=[{self.tl[2]:self.tl[3]},] self.input_bam_files[self.tl[1]]={} self.input_bam_files[self.tl[1]][self.tl[2]]=self.tl[3] else: self.input_bam_files[self.tl[1]][self.tl[2]]=self.tl[3] elif self.tl[0]=='input_include_bed_file': if not self.input_include_bed_files.get(self.tl[1]): self.input_include_bed_files[self.tl[1]]=self.tl[2] else: raise Exception('There are multiple entries for include bed files,\ please consolidate your coordinates') elif self.tl[0]=='input_include_linkage_bed_file': if not self.input_include_linkage_bed_files.get(self.tl[1]): self.input_include_linkage_bed_files[self.tl[1]]=self.tl[2] else: raise Exception('There are multiple entries for include linkage\ bed files, please consolidate your coordinates') elif self.tl[0]=='input_cnv_bed_file': if not self.input_cnv_bed_files.get(self.tl[1]): self.input_cnv_bed_files[self.tl[1]]=self.tl[2] else: raise Exception('There are multiple entries for include linkage\ bed files, please consolidate your coordinates') elif self.tl[0]=='interval': self.interval=self.tl[1] elif self.tl[0]=='nodetype': self.nodetype=self.tl[1] elif self.tl[0]=='projects_path': self.projects_path=self.tl[1] elif self.tl[0]=='population_file': self.population_file=self.tl[1] elif self.tl[0]=='seattleseq_dir_path': self.seattleseq_dir_path = self.tl[1] elif self.tl[0]=='genesToExclude': self.genes_to_exclude_path=self.tl[1] elif self.tl[0]=='cdpred_cutoff': self.cdpred_cutoff=self.tl[1] elif self.tl[0]=='cdpred_cutoff': self.cdpred_cutoff=self.tl[1] elif self.tl[0]=='send_mail_to': self.send_mail_to=self.tl[1] elif self.tl[0]=='liftOver_path': self.liftOver_path=self.tl[1] elif self.tl[0]=='liftOver_chain_path': self.liftOver_chain_path=self.tl[1] elif self.tl[0]=='dbsnp_vcf_path': self.dbsnp_vcf_path = self.tl[1] elif self.tl[0]=='dbsnp_db_path': self.dbsnp_db_path = self.tl[1] elif self.tl[0]=='population_db_path': self.population_db_path = self.tl[1] elif self.tl[0]=='expression_snps_path': self.expression_snps_path = self.tl[1] elif self.tl[0]=='chrominfohg19_path': self.chrominfohg19_path = self.tl[1] elif self.tl[0]=='hg18fasta': self.hg18fasta = self.tl[1] elif self.tl[0]=='dbconnect': self.dbconnect = self.tl[1] else: if self.debug==1: print 'This configuration entry is not used',self.tl class varfile(object): def __init__(self, htl): """ input: file name logic: make sure all file names are in the same format, otherwise mistakes will occur with affected status function .fam_members .num_affected .affected """ #------------------------------------------------------------------------------- self.first_sample_index = 0 for x in htl: if x[-3:]=='.NA': self.first_sample_index = htl.index(x) break self.total_num_samples = (len(htl)-self.first_sample_index)/3 assert float(self.total_num_samples)%1 == 0.0 #------------------------------------------------------------------------------- def get_project_members(self, project_code): """ reads projects_data.txt file generated using data from labmatrix and creates a dictionary of project_members """ self.project_members={} if args.projects_metadata_file != '': projects_metadata_file = args.projects_metadata_file else: projects_metadata_file = mycfg.projects_path prj_lines = open(projects_metadata_file).readlines() prj_htl = prj_lines[0].strip().split('\t') for line in prj_lines[1:]: if not line.startswith('#'): tl = line.strip().split('\t') if tl[prj_htl.index('Fam_id')] == project_code: self.project_members[tl[prj_htl.index('subject_code')]] = {} self.project_members[tl[prj_htl.index('subject_code')]]['mom'] = tl[prj_htl.index('mom')] self.project_members[tl[prj_htl.index('subject_code')]]['dad'] = tl[prj_htl.index('dad')] self.project_members[tl[prj_htl.index('subject_code')]]['Fam_id'] = tl[prj_htl.index('Fam_id')] self.project_members[tl[prj_htl.index('subject_code')]]['sex'] = tl[prj_htl.index('sex')] self.project_members[tl[prj_htl.index('subject_code')]]['affected'] = tl[prj_htl.index('affected')] self.project_members[tl[prj_htl.index('subject_code')]]['alive'] = tl[prj_htl.index('alive')] self.project_members[tl[prj_htl.index('subject_code')]]['founder'] = tl[prj_htl.index('founder_genotype_in_pedigree')] try: self.project_members[tl[prj_htl.index('subject_code')]]['birth_year'] = tl[prj_htl.index('birth_year')] except IndexError: self.project_members[tl[prj_htl.index('subject_code')]]['birth_year'] = "NA" try: self.project_members[tl[prj_htl.index('subject_code')]]['date_sent_to_nisc'] = tl[prj_htl.index('Date_sent_to_NISC')] except IndexError: self.project_members[tl[prj_htl.index('subject_code')]]['date_sent_to_nisc'] = "NA" if self.project_members == {}: print 'NOTICE: Project_code = %s :No data was found in the projects_data file get_fam_members()'%project_code if not project_code: print "No project code was given" return self.project_members def get_num_affected(self, project_code): self.num_affected=0 self.project_members = self.get_project_members(project_code) for member,data in self.project_members.iteritems(): assert data['affected'] in ['1','0'], """The affected status is not 0\ or 1, the value found for affected status is >>" %s "<<, Please correct\ the value in %s"""%(data['affected'], projects_metadata_path) if data['affected'] == '1': self.num_affected += 1 return self.num_affected def downdb(): """ downloads UCSC annotations files """ if args.downdb.lower()=='knowngene': dbname = 'knownGene' sys.stderr.write('using %s to specify use --buildver \n'%args.buildver) command = 'wget ftp://hgdownload.cse.ucsc.edu/goldenPath/%s/database/%s.txt.gz'%(args.buildver, dbname) subprocess.call(shlex.split(command)) command = 'mv %s.txt.gz %s/'%(dbname, args.dbdir) subprocess.call(shlex.split(command)) print 'file downloaded to %s'%args.dbdir def sendmail(): enddate=time.ctime() endall=time.time() pathname, scriptname = os.path.split(sys.argv[0]) Msg='Script name:%s \nFunction name: %s\n\narguments:%s\n\n'\ 'finished execution, please check output.\n'%(scriptname, args.function_name, args) p = os.popen("/usr/sbin/sendmail -t", "w") p.write("From: sincanm@mail.nih.gov\n") p.write("To: %s\n"%mycfg.send_mail_to) p.write("Subject: program execution status\n") p.write("Start date:%s\nEnd date:%s\n"%(startdate,enddate)) p.write("Elapsed time is %.3s min\n"%((endall-startall)/60)) p.write(Msg) p.close() def usage(): print """This program takes function names and other parameters to run. --func available function names are usage split_by_chrs, prep_ss, annotate_with, annotate_with_allfams, annotate_with_ss_snv, annotate_with_ss_indel, scandirs_cdpred, mendelian_filters, mendelian_filters_com_het_res, annotate_with_mendelian_filters, reduce_annotate_with, create_allsamples_db, update_allfams, update_local_freq_file, dbsnp_vcf2db, prepare_liftover_input, add_hg19, reduce_addhg19, run_analysis run_analysis_single_file ) --downdb [knowngene,] --buildver (default: hg19) Please contact sincanm@mail.nih.gov for instructions. """ def coords_19(interval): """Returns a list with each item containing [chr,start,stop positions] based on the interval given""" test=0 chrinfolist=[] for line in open(mycfg.chrominfohg19_path): tl=line.strip().split('\t') if tl[0] in chrs: chrinfolist.append(tl[:2]) if test: print '*** chrinfolist',chrinfolist if type(interval)==str: interval=int(interval) global coords19 coords19=[] for item in chrinfolist: coords19.append([item[0],0,interval]) while int(item[1])>coords19[-1][2]: coords19.append([item[0],int(coords19[-1][2]+1),int(coords19[-1][2])+interval]) if test: print '***coords19',coords19 return coords19 def write_lines(path1,linelist): file=open(path1,"w") file.writelines(linelist) file.close() def get_ptcode(headerline,p_pos): """ Reads the header line that is provided in the function call and return the family codes based on the position provided in the initial variants for positions """ test=0 global pt_code pt_code="" tl=headerline.strip().split("\t") try: if tl[p_pos][-3:]==".NA": pt_code=tl[p_pos][:-3] elif tl[p_pos][-3:]!=".NA": pt_code=tl[p_pos] except Exception: print ('headerline in get genotypes()',headerline) print ('tl without[p_pos] at get_ptcode',tl) print ('p_pos in get_ptcode()',p_pos) if test: print "tl{p_pos] is %s"%tl[p_pos] print "tl{p_pos][:-3] is %s"%tl[p_pos][:-3] print "pt_code is %s"%pt_code return pt_code def p_ids(): """ input:none, uses projects metadata file (argument or configuration file entry) output: patient_ids(old) and family_data(new, udp code as key and all the rest of family data file info as values """ test=0 #redundant global variable decleratiosn as the return values are assigned to variables now. #cruft delete from code #global patient_ids #global family_data patient_ids={} family_data={} try: projects_lines = open(mycfg.projects_path).readlines() htl = projects_lines[0].strip().split("\t") #index Fam_id subject_code sex affected mom dad alive birth_year date_sent_to_NISC for line in projects_lines[1:]: if test: print line time.sleep(1) if not line.startswith('#'): tl=line.strip().split('\t') patient_ids[tl[htl.index("subject_code")]]=tl[htl.index("index")] family_data[tl[htl.index("subject_code")]]=tl[htl.index("sex"):] except Exception: print "%s could not be found"%mycfg.projects_path sys.exit(1) if test: print 'patient ids:',patient_ids, 'len patient ids:',len(patient_ids) print '\n\nfamily data:',family_data time.sleep(1) return patient_ids, family_data def mpg_scores( tl, htl): """ input: a varsifter line, the file the line comes from initial score is 1 each mpg/coverage for an individual is evaluated according to the formula for a penalty of :(1/number of individuals in the family) 1-penalty is the final score if the mpg score or the coverage is < 10 there is a penalty if the coverage is between 10-20 we require the mpg to be at least half of the coverage if the coverage is more than 20 than the mpg can be as low as 20% of the coverage returns a float between 0 and 1.0 , the less the number the worse the mpg_score evaluation is """ test= False if args.nextgene_data: cutoff= 7 else:cutoff=10.0 if args.nextgene_data: validcutoff = 14.0 else: validcutoff=20.0 if args.nextgene_data: badmpgratiocutoff = 0.2 else: badmpgratiocutoff = 0.5 if args.nextgene_data: goodmpgratiocutoff = 0.1 else: goodmpgratiocutoff=0.15 final_mpg_score = Fraction(1,1) myfileobj = varfile(htl) pos = myfileobj.first_sample_index num_pts=(len(tl)-(pos))/3 #print 'pos returned by myfileobj.first_sample_index is =>',pos #print 'data at that position and -3:10 ',tl[pos-3:pos+10] #print 'num_pts',num_pts #time.sleep(100) for mem in xrange(num_pts): mpgpos = pos+((mem*3)+1) coveragepos = pos+((mem*3)+2) try: float(tl[mpgpos].replace('NA','0')) except ValueError, TypeError: sys.stderr.write('mpgpos =%s, tl[mpgpos]=%s \n'%(mpgpos,tl[mpgpos])) sys.stderr.write('mpg column doesn\'t contain numeric data ay lineindex %s\n'%tl[htl.index('Index')]) sys.stderr.write('Exiting program, please correct the data\n') raise Exception if (not tl[coveragepos].isdigit()) and tl[coveragepos] != "NA": sys.stderr.write("coverage column doesn't contain numeric data\n") sys.stderr.write("coverage pos = %s\n"%coveragepos) sys.stderr.write("tl[coveragepos] = %s\n"%tl[coveragepos]) sys.exit() mpg_score = float(tl[mpgpos].replace("NA","0")) coverage = float(tl[coveragepos].replace("NA","0")) if test: print 'mpg_score',mpg_score print 'coverage',coverage if (mpg_score <= cutoff) or (coverage <= cutoff): final_mpg_score -= Fraction(1, num_pts) elif cutoff < mpg_score < validcutoff: if (mpg_score /coverage) <= badmpgratiocutoff: #print 'after else',tl[pos+1],tl[pos+2] #print 'score is bad',line #print '1',final_mpg_score final_mpg_score -= Fraction(1, num_pts) #print '2',final_mpg_score elif ( mpg_score / coverage) > badmpgratiocutoff: #print 'after 2.if',tl[pos+1],tl[pos+2] pass #print 'score is good',line #time.sleep(1) elif mpg_score >= validcutoff: pass #if (mpg_score /coverage) <= goodmpgratiocutoff: #print 'after else',tl[pos+1],tl[pos+2] #print 'score is bad',line #print '1',final_mpg_score #final_mpg_score -= Fraction(1, num_pts) #print '2',final_mpg_score #elif ( mpg_score / coverage) > goodmpgratiocutoff: #print 'after 2.if',tl[pos+1],tl[pos+2] #pass #print 'score is good',line #time.sleep(1) else: raise Exception #print 'mpg score value calculated in the final_mpg_scores function is =>',final_mpg_score #print 'sleeping 2 secs' #time.sleep(2) if not (0 <= float(final_mpg_score) <= 1.0): print 'final_mpg_score is not within defined limits' print 'final_mpg_score = %s'%float(final_mpg_score) print 'number of total samples is %s'%num_pts print 'line is', line raise Exception return float(final_mpg_score) def get_genotypes(tl1,htl,myaffected, first_sample_index, family_data): """ This function gets a line and a headerline as input, splits it by tab and determines the genotype of individuals returns a dictionary with UDP codes as keys and pt raw genotypes and pt calculated genotypes as values gts [0]= patient genotype with two alleles gts [1]= patient calculated genotype class (hom_ref, hom_var, het_complete, var_incoplete, ref_incomplete, novel_mutation, ref_novel_het, var_novel_het, NA ) gts [2]= mpg score gts [3]= coverage #the rest comes from family_data returned by p_ids() gts [4]= sex gts [5]= affected gts [6]= mom udpcode gts [7]= dad udpcode gts[8]=alive[0,1] gts[9]= founder_in_pedigree gts[10]= birth year gts[11]= date sent to NISC """ test=0 if test: print 'tl1',tl1 print 'htl',htl print 'myaffected',myaffected print 'first_sample_index',first_sample_index print 'this is tl [first_sample_index-5:first_sample_index+5]',tl1[int(first_sample_index)-5:int(first_sample_index)+5] print 'tl[first_sample_index-1] ',tl1[first_sample_index-1] print 'tl[first_sample_index] ',tl1[first_sample_index] print 'tl[first_sample_index+1] ',tl1[first_sample_index+1] #for x in htl: # print htl.index(x), x, tl1[htl.index(x)] #print 'family_data',family_data gts={} if len(tl1) != len(htl): sys.stderr.write('tl != htl len(tl)= %s len(htl)=%s\n'%(len(tl1), len(htl))) sys.stderr.write('tl= %s\n'%tl1) sys.stderr.write('htl= %s\n'%htl) raise Exception pos = first_sample_index if test: print 'this is pos as a result of using myvarfile.first_sample_index with htl ',pos print 'this is tl [pos-5:pos+5]',tl1[int(pos)-5:int(pos)+5] print 'This is the item at tl[pos]',tl1[pos] print '\nline in get_genotypoes() is ',tl1 print 'patient data in line in get_genotypes_ is ', tl1[pos-1:] print 'number of patients determined by get_genotypes_() is ', len(myaffected) time.sleep(1) for key in myaffected.keys(): if test: print 'key in myaffected before pop', key if not '%s.NA'%key in htl: print 'popping key %s'%key print htl[40:] if test: 'key %s popped from final list'%key myaffected.pop(key) if test: print 'len myaffected', len(myaffected) for k,v in myaffected.iteritems(): print 'k,v in myaffected',k,v for sample_code,v in myaffected.iteritems(): if test: print 'sample_code, v',sample_code, v #print 'in get_genotypes_ function: pt_code is => %s'%sample_code #time.sleep(1) #print "data dictionary = 1:hom ref,2:hom var, 3:het complete,\ #4:ref incomplete,5:var incompelete, 6:single novel mutation,\ #7:ref novel het, 8:var novel het , 9:hom_novel" gt_class="" if len(tl1[htl.index('%s.NA'%sample_code)])<2: if test: print "found incomplete genotype",tl1 print tl1[htl.index('%s.NA'%sample_code)][allele] #if tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["var_allele"]]: if tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('var_allele')]: gt_class="var_incomplete" #elif tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["ref_allele"]]: elif tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('ref_allele')]: gt_class="ref_incomplete" elif (tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("var_allele")]): gt_class="novel_mutation" else: print "something wrong check the row",tl1 #time.sleep(1) elif len(tl1[htl.index('%s.NA'%sample_code)])==2: varcount=0 refcount=0 novelcount=0 if tl1[htl.index('%s.NA'%sample_code)]=="NA": gt_class="NA" else: #print "this is a valid genotype =>",tl1[htl.index('%s.NA'%sample_code)] #print tl1[htl.index('%s.NA'%sample_code)] #print tl1[htl.index('var_allele']] for allele in xrange(2): #print tl1[htl.index('%s.NA'%sample_code)][allele] if tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("var_allele")]: varcount+=1 elif tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("ref_allele")]: refcount+=1 elif (tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("var_allele")]): novelcount+=1 #a=freq[tl1[htl.index('%s.NA'%sample_code)][allele]] #print a[0],a[1] #freql.append(a[1]+1) #freq[tl1[htl.index('%s.NA'%sample_code)][allele]]=[gt_id[0][1],a[1]+1] #print "varcount",varcount #print "ref count",refcount if varcount==2: gt_class="hom_var" elif refcount==2: gt_class="hom_ref" elif novelcount==2: gt_class="hom_novel" elif refcount==1 and varcount==1: gt_class="het_complete" elif refcount==1 and novelcount==1: gt_class="ref_novel_het" elif varcount==1 and novelcount==1: gt_class="var_novel_het" else: print "something wrong check the row",tl1[a:a+3] #print 'line is', line #print 'calculated genotype class is',gt_class #time.sleep(1) elif len(tl1[htl.index('%s.NA'%sample_code)])>2: if ":" in tl1[htl.index('%s.NA'%sample_code)]: varcount=0 refcount=0 novelcount=0 #print 'found a double colon seperated one',tl1 for allele in xrange(2): #print "allele = %s "%allele allele_gt="" if allele==0: allele_gt=tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))] elif allele==1: allele_gt=tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:] else: print "allele gt could not be determined" print "allele_gt is:",allele_gt #print tl1[htl.index('%s.NA'%sample_code)][allele] if allele_gt==tl1[htl.index("ref_allele")]: refcount+=1 elif allele_gt==tl1[htl.index("var_allele")]: varcount+=1 elif allele_gt!=tl1[htl.index("ref_allele")] and allele_gt!=tl1[htl.index("var_allele")]: novelcount+=1 else: print "something wrong with determining the genotype counts , check the row",tl1[a:a+3] if varcount==2: gt_class="hom_var" elif refcount==2: gt_class="hom_ref" elif novelcount==2: gt_class="hom_novel" elif refcount==1 and varcount==1: gt_class="het_complete" elif refcount==1 and novelcount==1: gt_class="ref_novel_het" elif varcount==1 and novelcount==1: gt_class="var_novel_het" else: print "ref gt=",tl1[htl.index("ref_allele")],len(tl1[htl.index("ref_allele")]) print "var gt=",tl1[htl.index("var_allele")],len(tl1[htl.index("var_allele")]) print "patient gt=",tl1[htl.index('%s.NA'%sample_code)],tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))],len(tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))]),tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:],len(tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:]) print "rc",refcount,"vc",varcount,"nc",novelcount print "something wrong determining gt class variables, check the row",tl1[a:a+3] #print ">2 freq1",freq1 #time.sleep(1) else: if tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("var_allele")]: gt_class="hom_var" elif tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("ref_allele")]: gt_class="hom_ref" elif tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("var_allele")]: gt_class="het_complete" else: print "something wrong check the row",tl1[a:a+3],tl1[htl.index('%s.NA'%sample_code)] else: print 'row is not 2 or more characters long check row',tl1 #read family data table in to a dictionary #patient_ids, family_data = p_ids() #print 'pt_code in get genotypes() function is =>',sample_code gts[sample_code] = [tl1[htl.index('%s.NA'%sample_code)], gt_class, tl1[htl.index('%s.NA'%sample_code)+1], tl1[htl.index('%s.NA'%sample_code)+2]] + family_data[sample_code] """ print 'could not get value into gts dictionary, family data may be'\ 'inconsistent with the variant report file make sure the patient'\ 'codes match between the two files' print 'Family data returned by p_ids() function as family_data =>',family_data print '\npt_code in get genotypes() is %s \n'%sample_code print 'tl1:',tl1 print 'gts dictionary is as follows' for k,v in gts.items(): print k,'==>',v raise Exception #print 'counter is %s'%counter #time.sleep(.5) """ #test code for the fucntion if test: print 'gts.iteritems()' for k,v in gts.iteritems(): print k,v if len(gts)!= len(myaffected) or gts == {}: print "Error in get_genotypes()" print "number of subjects is %s"%len(myaffected) print 'myaffected=',myaffected print "number of items in gts is %s"%len(gts) print 'gts:',gts print tl1 raise Exception for k,v in gts.items(): #print 'genotype dict entry is k =%s v=>%s'%(k,v) #time.sleep(1) assert len(v)==12,'genotypes dictionary was not constructed correctly, please\ check the results k=%s, v=%s v is %s items long'%(k,v,len(v)) #time.sleep(1) #print 'testcode gts=',gts #print tl1 #time.sleep(30) return gts class varline(object): """ patient calculated zygosity state (hom_ref, hom_var, het_complete, var_incoplete, ref_incomplete, novel_mutation, ref_novel_het, var_novel_het, NA ) """ ref_gt_count = 0 var_gt_count = 0 novel_gt_count = 0 ref_allele_count = 0 var_allele_count = 0 novel_allele_count = 0 def __init__(self,tl,htl,line=None): self._gt_class = '' self.pos = None self.htl = htl self.tl = tl all_genotypes = [] self.all_genotypes_are_same = False for x in htl: if x.endswith('.NA'): self.pos = htl.index(x) break self.mpg_coverage_score = mpg_scores(tl, htl) assert isinstance(self.mpg_coverage_score, float) for self.member in range(self.pos,len(self.tl),3): #print 'member',self.member, 'len (line)',len(self.tl),'int (pos)',int(self.pos) #get_ptcode(hline,pos) #gt_index.append(pt_code) ###for self.value in self.tl[self._a:self._a+3]: #self.gt_index=[] self.gt_index = self.tl[self.member:self.member+3] all_genotypes.append(self.gt_index[0]) if len(self.gt_index[0])<2: #print "found incomplete genotype in ",self.tl if self.gt_index[0]==self.tl[self.htl.index('ref_allele')]: self._gt_class='ref_incomplete' self.ref_allele_count+=1 elif self.gt_index[0]==self.tl[self.htl.index('var_allele')]: self._gt_class='var_incomplete' self.var_allele_count+=1 elif self.gt_index[0]!=self.tl[self.htl.index('var_allele')] and self.gt_index[0]!=self.tl[self.htl.index('var_allele')]: self._gt_class='novel_incomplete' self.novel_allele_count+=1 else: print "something wrong check the row",il1[_a:_a+3] elif len(self.gt_index[0])==2: self.refcount=0 self.varcount=0 self.novelcount=0 if self.gt_index[0]=="NA": #print "found NA",il1 pass else: for self.allele in xrange(2): if self.gt_index[0][self.allele]==self.tl[self.htl.index('var_allele')]: self.varcount+=1 elif self.gt_index[0][self.allele]==self.tl[self.htl.index('ref_allele')]: self.refcount+=1 elif self.gt_index[0][self.allele]!=self.tl[self.htl.index('ref_allele')] and self.gt_index[0][self.allele]!=self.tl[self.htl.index('var_allele')]: self.novelcount+=1 if self.varcount==2: self._gt_class='hom_var' self.var_gt_count+=1 self.var_allele_count+=2 elif self.refcount==2: self._gt_class='hom_ref' self.ref_gt_count+=1 self.ref_allele_count+=2 elif self.novelcount==2: self._gt_class='hom_novel' self.novel_gt_count+=1 self.novel_allele_count+=2 elif self.refcount==1 and self.varcount==1: self._gt_class='het_complete' self.ref_allele_count+=1 self.var_allele_count+=1 elif self.refcount==1 and self.novelcount==1: self._gt_class='ref_novel_het' self.ref_allele_count+=1 self.novel_allele_count+=1 elif self.varcount==1 and self.novelcount==1: self._gt_class='var_novel_het' self.var_allele_count+=1 self.novel_allele_count+=1 else: print "something wrong check the row",il1[_a:_a+3] #print 'self gt index',self.gt_index #print 'self ref count',self.refcount #print 'self var count',self.varcount #print 'self novel count',self.novelcount #print 'self gt class',self._gt_class elif len(self.gt_index[0])>2: if ":" in self.gt_index[0]: self.varcount=0 self.refcount=0 self.novelcount=0 for self.allele in xrange(2): #print "allele = %s "%allele self.allele_gt = "" if self.allele == 0: self.allele_gt = self.gt_index[0][:(self.gt_index[0].index(":"))] elif self.allele == 1: self.allele_gt = self.gt_index[0][(self.gt_index[0].index(":"))+1:] else: print "allele gt could not be determined" print "allele_gt is:",self.allele_gt #test code #print self.gt_index[0], self.allele_gt #time.sleep(1) if self.allele_gt == self.tl[self.htl.index('var_allele')]: self.varcount+=1 elif self.allele_gt == self.tl[self.htl.index('ref_allele')]: self.refcount+=1 elif self.allele_gt != self.tl[self.htl.index('ref_allele')] and self.allele_gt != self.tl[self.htl.index('var_allele')]: self.novelcount+=1 else: print "something wrong with determining the genotype counts , check the row",il1[_a:_a+3] if self.varcount == 2: self._gt_class = 'hom_var' self.var_gt_count += 1 self.var_allele_count += 2 elif self.refcount == 2: self._gt_class = 'hom_ref' self.ref_gt_count += 1 self.ref_allele_count += 2 elif self.novelcount == 2: self._gt_class='hom_novel' self.novel_gt_count+=1 self.novel_allele_count+=2 elif self.refcount==1 and self.varcount==1: self._gt_class='het_complete' self.ref_allele_count+=1 self.var_allele_count+=1 elif self.refcount==1 and self.novelcount==1: self._gt_class='ref_novel_het' self.ref_allele_count+=1 self.novel_allele_count+=1 elif self.varcount==1 and self.novelcount==1: self._gt_class='var_novel_het' self.var_allele_count+=1 self.novel_allele_count+=1 else: print "something wrong check the row",il1[_a:_a+3] self.gt_index.append(self._gt_class) if len(set(all_genotypes)) == 1: self.all_genotypes_are_same = True elif len(set(all_genotypes)) > 1: self.all_genotypes_are_same = False else: raise Exception def is_hom_rec(self,patient,line,varfile): #work on this implement this print 'line',line print 'varfile',varfile print patient def scandirs_cdpred(inputfile, outputfile): print "file: processing file: ",inputfile myfile = open(inputfile) hline = myfile.readline() htl = hline.strip().split('\t') muttype_present = False if 'muttype' in hline: muttype_present = True cdneglines=[] cdneglines.append(hline) linecounter=0 snv_counter=0 indel_counter=0 for line in myfile: linecounter+=1 tl=line.strip().split('\t') try: #print 'cdpred score',tl[myfilespecs['CDPred_score']] #print 'cdpred score type',type(tl[myfilespecs['CDPred_score']]) #print 'cutoff',cutoff #print 'cutoff type',type(cutoff) #time.sleep(.2) if 'polyPhen' in hline and ('CDPred_score' in hline): if tl[htl.index('CDPred_score')] =='-': cdneglines.append(line) elif ( int(tl[htl.index('CDPred_score')] ) < int(mycfg.cdpred_cutoff) or 'damaging' in tl[htl.index('ss_polyPhen')]): cdneglines.append(line) else: print 'CdPred is not a number or "-"' raise Exception elif 'polyPhen' in hline and (not 'CDPred_score' in hline): #if 'damaging' in tl[htl.index('ss_polyPhen')]: cdneglines.append(line) else: raise Exception if muttype_present: if tl[htl.index('muttype')]=="SNP": snv_counter+=1 elif tl[htl.index('muttype')]=="INDEL": indel_counter+=1 except Exception: print """There is a problem with CDPred score or with PolyPhen score either the scores are missing or the format is not correct. Please check the input files""" print 'len cdneglines=',len(cdneglines) write_lines(outputfile, cdneglines) if linecounter != len(cdneglines): print 'there were %s lines in the file, only %s of those were included in the cdneg file'%(linecounter,len(cdneglines)) tmpdir=os.path.dirname(inputfile) fstats=open("%s/analysis_stats.txt"%tmpdir,"a") fstats.write("Number of total variation lines in %s file is:\t%s"%(os.path.basename(inputfile),linecounter) +"\n" ) if muttype_present: fstats.write("Number of SNVs in %s file is:\t%s"%(os.path.basename(inputfile),snv_counter) +"\n" ) fstats.write("Number of INDELs in %s file is:\t%s"%(os.path.basename(inputfile),indel_counter) +"\n" ) fstats.write("Total number of cd-pred negative lines are:\t%s\n"%(len(cdneglines)-1)) #end of stats class db_class: def create_tables(self): vars_table = Table( 'final_variants',metadata, Column('myindex', Integer, primary_key=True), Column('project_code',String(50),nullable=False), Column('mend_type',String(50),nullable=False), Column('Chr',String(50),nullable=False), Column('hg18LeftFlank',String(50),nullable=False), Column('hg18RightFlank',String(50),nullable=False), Column('hg19LeftFlank',String(50),nullable=False), Column('hg19RightFlank',String(50),nullable=False), Column('ref_allele',String(200),nullable=False), Column('var_allele',String(200),nullable=False), useexisting=True, ) Index('idx_clr1', vars_table.c.mend_type, vars_table.c.Chr, vars_table.c.hg19LeftFlank, vars_table.c.hg19RightFlank, vars_table.c.var_allele) compound_het_vars_table= Table( 'final_compound_het_variants',metadata, Column('myindex',Integer,primary_key=True), Column('project_code',String(50),nullable=False), Column('mend_type',String(50),nullable=False), Column('Chr1',String(50),nullable=False), Column('hg18LeftFlank1',String(50),nullable=False), Column('hg18RightFlank1',String(50),nullable=False), Column('hg19LeftFlank1',String(50),nullable=False), Column('hg19RightFlank1',String(50),nullable=False), Column('ref_allele1',String(200),nullable=False), Column('var_allele1',String(200),nullable=False), Column('Chr2',String(50),nullable=False), Column('hg18LeftFlank2',String(50),nullable=False), Column('hg18RightFlank2',String(50),nullable=False), Column('hg19LeftFlank2',String(50),nullable=False), Column('hg19RightFlank2',String(50),nullable=False), Column('ref_allele2',String(200),nullable=False), Column('var_allele2',String(200),nullable=False), useexisting=True, ) Index('idx_clr2', compound_het_vars_table.c.mend_type, compound_het_vars_table.c.Chr1, compound_het_vars_table.c.hg18LeftFlank1, compound_het_vars_table.c.hg18RightFlank1, ) genotypes_table= Table( 'genotypes', metadata, Column('myindex',None,ForeignKey('final_variants.myindex')), Column('genotype',String(200),nullable=False), Column('genotype_score',String(50),nullable=False), Column('coverage',String(50),nullable=False), useexisting=True, ) metadata.create_all() def check_and_add_db(self, htl=None, variantline1=None, variantline2=None, project_code=None, mendtype=None): """ varfile is the file the variant line is extracted from ,used for filespecs dictionary creation variant line is the list of fileds generated by mendelian inheritance analysis from the line in question mendtype is the type of mendelian inheritence performed to create the line, project code is the deidentified code for sample studied the format of the line is varsifter like """ # if this creates an error due to db tables not being created , call self.create_tables() in a try , except clause vars_table = Table('final_variants', metadata, autoload=True) compound_het_vars_table = Table('final_compound_het_variants',metadata, autoload=True) genotypes_table = Table('genotypes', metadata, autoload=True) # if this creates an error due to db tables not being created , call self.create_tables() in a try , except clause test=0 verbose=1 if test: print 'test code in db_class' print 'variantline=',variantline print 'project_code',project_code print 'mendtype',mendtype tl1=variantline1 tl2=variantline2 if mendtype in ('autozomal_rec','autozomal_dom','xlinked_rec'): s = vars_table.select(and_ (vars_table.c.mend_type == mendtype, vars_table.c.Chr == tl1[htl.index('Chr')], vars_table.c.hg18LeftFlank == tl1[htl.index('LeftFlank')], vars_table.c.hg18RightFlank == tl1[htl.index('RightFlank')], vars_table.c.var_allele == tl1[htl.index('var_allele')] )) test=0 if test: print mendtype print tl1[htl.index('Chr')] print tl1[htl.index('LeftFlank')] print tl1[htl.index('RightFlank')] print tl1[htl.index('var_allele')] raise Exception rs = s.execute() rows = rs.fetchall() rs.close() #print 'sqlalchemy:test: resultset rows is %s items'%len(rows) foundsameflag=False founddifferentflag=False foundsamerow=None founddifferentrow=[] if rows: for row in rows: if not row[3].startswith('chr'): print "row[3]",row[3] print "row",row raise Exception if (row[1],row[2],row[3],row[4],row[5],row[9]) == (project_code, mendtype, tl1[htl.index('Chr')], tl1[htl.index('LeftFlank')], tl1[htl.index('RightFlank')], tl1[htl.index('var_allele')], ): foundsameflag=True foundsamerow=row elif (row[3],row[4],row[5],row[6],row[9]) == ( mendtype, tl1[htl.index('Chr')], tl1[htl.index('LeftFlank')], tl1[htl.index('RightFlank')], tl1[htl.index('var_allele')], ) and row[1]!= project_code: founddifferentflag=True founddifferentrow.append(row) if foundsameflag == True: pass #print 'variant already in the database,skipping' elif foundsameflag == False: ins = vars_table.insert() ins.execute( myindex=None, project_code = project_code, mend_type = mendtype, Chr = tl1[htl.index('Chr')].lower(), hg18LeftFlank = tl1[htl.index('LeftFlank')], hg18RightFlank = tl1[htl.index('RightFlank')], hg19LeftFlank = tl1[htl.index('hg19LeftFlank')], hg19RightFlank = tl1[htl.index('hg19RightFlank')], ref_allele = tl1[htl.index('ref_allele')], var_allele=tl1[htl.index('var_allele')], ) #print 'not found in the database, added new record' else: raise Exception return {'foundsameflag':foundsameflag,'founddifferentflag':founddifferentflag, 'foundsamerow':foundsamerow,'founddifferentrow':founddifferentrow} elif mendtype == "compound_heterozygote": if 'LeftFlank' in htl: s = compound_het_vars_table.select(and_ ( compound_het_vars_table.c.project_code == project_code, compound_het_vars_table.c.Chr1 == tl1[htl.index('Chr')], compound_het_vars_table.c.hg18LeftFlank1 == tl1[htl.index('LeftFlank')], compound_het_vars_table.c.hg18RightFlank1 == tl1[htl.index('RightFlank')], )) rs = s.execute() rows = rs.fetchall() rs.close() #print 'resultset of compound het rows is %s items'%len(rows) foundsameflag=False founddifferentflag=False foundsamerow=None founddifferentrow=[] if rows: for row in rows: if ((row[1],row[2], row[3],row[4],row[5],row[9], row[10],row[11],row[12],row[16]) == (project_code, mendtype, tl1[htl.index('Chr')],tl1[htl.index('LeftFlank')],tl1[htl.index('RightFlank')],tl1[htl.index('var_allele')], tl2[htl.index('Chr')],tl2[htl.index('LeftFlank')],tl2[htl.index('RightFlank')],tl2[htl.index('var_allele')], )): foundsameflag = True foundsamerow = row elif ((row[2], row[3],row[4],row[5],row[9], row[10],row[11],row[12],row[16]) == (mendtype, tl1[htl.index('Chr')],tl1[htl.index('LeftFlank')],tl1[htl.index('RightFlank')],tl1[htl.index('var_allele')], tl2[htl.index('Chr')],tl2[htl.index('LeftFlank')],tl2[htl.index('RightFlank')],tl2[htl.index('var_allele')], ) and row[1] != project_code): founddifferentflag = True founddifferentrow.append(row) if foundsameflag == True: pass #print 'variant already in the database,skipping' elif foundsameflag == False: ins = compound_het_vars_table.insert() ins.execute( myindex = None, project_code = project_code, mend_type = mendtype, Chr1 = tl1[htl.index('Chr')].lower(), hg18LeftFlank1 = tl1[htl.index('LeftFlank')], hg18RightFlank1 = tl1[htl.index('RightFlank')], hg19LeftFlank1 = tl1[htl.index('hg19LeftFlank')], hg19RightFlank1 = tl1[htl.index('hg19RightFlank')], ref_allele1=tl1[htl.index('ref_allele')], var_allele1=tl1[htl.index('var_allele')], Chr2=tl2[htl.index('Chr')], hg18LeftFlank2=tl2[htl.index('LeftFlank')], hg18RightFlank2=tl2[htl.index('RightFlank')], hg19LeftFlank2=tl2[htl.index('hg19LeftFlank')], hg19RightFlank2=tl2[htl.index('hg19RightFlank')], ref_allele2=tl2[htl.index('ref_allele')], var_allele2=tl2[htl.index('var_allele')], ) #print 'not found in the database, added compound het pairs as new record' else: pass else: print "Mendelian type was not determined" print "mendtype = %s"%mendtype raise Exception return {'foundsameflag':foundsameflag,'founddifferentflag':founddifferentflag, 'foundsamerow':foundsamerow,'founddifferentrow':founddifferentrow} def penalty_score(tl,htl,cdnegfile,inheritance_model, project_code): penaltyMAF=0.0 penaltyG5=0.0 penalty_linkage=0.0 penaltyHomozygousVarGT=0.0 penaltyVarAlleleFrequency=0.0 penaltyVarAlleleFrequency1=0.0 population_var_allele_count = 0.0 if 'Single_VarAllele_UDP_count' in htl: if tl[htl.index('Single_VarAllele_UDP_count')] == '-': pass else: population_var_allele_count += float(tl[htl.index('Single_VarAllele_UDP_count')]) if 'CS_572c_varallele' in htl: if tl[htl.index('CS_572c_varallele')] == '-': pass else: population_var_allele_count += float(tl[htl.index('CS_572c_varallele')]) myvarfile = varfile(htl) myvarline = varline(tl,htl) num_affected = myvarfile.get_num_affected(project_code) total_num_samples = myvarfile.total_num_samples if tl[htl.index('dbsnp137GMAF')] == '-' or tl[htl.index('dbsnp137GMAF')] == '': pass else: penaltyMAF = float(tl[htl.index('dbsnp137GMAF')]) * 100 #------------------------------------------------------------------------------- if tl[htl.index('dbsnp137G5')] == '+': penaltyG5 = 50 #------------------------------------------------------------------------------- if tl[htl.index('in_linkage_regions')] == '-': penalty_linkage = 50 elif tl[htl.index('in_linkage_regions')] == 'NA' or tl[htl.index('in_linkage_regions')] == '+': pass else: print """Error value in= tl[htl.index('in_linkage_regions')]""", tl[htl.index('in_linkage_regions')] raise Exception #penaltyHomozygousVarGT for local population data if tl[htl.index('Homozygous_VarAllele_UDP_count')]=='-': pass elif int(tl[htl.index('Homozygous_VarAllele_UDP_count')])<=num_affected: pass elif int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) > num_affected: if inheritance_model in ('autozomal-rec','x-linked-rec'): penaltyHomozygousVarGT += (int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) - num_affected) * 1 elif inheritance_model in ('autozomal-dom','compound-heterozygous'): penaltyHomozygousVarGT += (int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) - num_affected) * 10 else: print 'inheritance_model is not recognized' raise Exception else: print 'ERROR: Unexpected value=>hom gt value is',tl[htl.index('Homozygous_VarAllele_UDP_count')] print 'number of affecteds in the family is',num_affected raise Exception #penaltyHomozygousVarGT for ClinSeq data if 'CS_572c_homvar' in htl: if tl[htl.index('CS_572c_homvar')] == '-': pass elif int(tl[htl.index('CS_572c_homvar')]) >= 0: if inheritance_model in ('autozomal-rec','x-linked-rec'): penaltyHomozygousVarGT += int(tl[htl.index('CS_572c_homvar')]) elif inheritance_model in ('autozomal-dom','compound-heterozygous'): penaltyHomozygousVarGT += int(tl[htl.index('CS_572c_homvar')]) * 10 else: print 'inheritance_model is not recognized' raise Exception else: print 'ERROR: Unexpected value=>ClinSeq hom_var_gt value is',tl[htl.index('CS_572c_homvar')] print 'number of affecteds in the family is',num_affected raise Exception else: pass #allele frequency penalties for local population data if population_var_allele_count == 0.0: pass #if 'PRF1' in tl: # print 'tl[htl.index(Single_VarAllele_UDP_count)]',tl[htl.index('Single_VarAllele_UDP_count')] # print 'myvarline.var_allele_count',myvarline.var_allele_count if tl[htl.index('Single_VarAllele_UDP_count')] == '-': pass elif int(tl[htl.index('Single_VarAllele_UDP_count')]) <= int(myvarline.var_allele_count): pass elif int(tl[htl.index('Single_VarAllele_UDP_count')]) > int(myvarline.var_allele_count) : if inheritance_model in ('autozomal-rec','x-linked-rec'): if penaltyHomozygousVarGT == 0.0: penaltyVarAlleleFrequency += float(tl[htl.index('Single_VarAllele_UDP_count')]) else: penaltyVarAlleleFrequency += float(tl[htl.index('Single_VarAllele_UDP_count')]) * 10 elif inheritance_model in ('autozomal-dom','compound-heterozygous'): if int(tl[htl.index('Homozygous_VarAllele_UDP_count')]) <= 0: #if there are no homozygous variants in the population if int(tl[htl.index('Single_VarAllele_UDP_count')]) >= int(myvarline.var_allele_count): penaltyVarAlleleFrequency += (float(tl[htl.index('Single_VarAllele_UDP_count')])- float(myvarline.var_allele_count)) #variant alleles in the population deduct penalty elif int(myvarline.var_allele_count) > int(tl[htl.index('Single_VarAllele_UDP_count')]): #if there are more variant alleles in the family/project than the population => variant allele is private to this family pass else: # if there are homozygous variant genotypes if int(tl[htl.index('Single_VarAllele_UDP_count')]) >= int(myvarline.var_allele_count): penaltyVarAlleleFrequency += ((float(tl[htl.index('Single_VarAllele_UDP_count')])- float(myvarline.var_allele_count))*10) #variant alleles in the population deduct penalty elif int(myvarline.var_allele_count) > int(tl[htl.index('Single_VarAllele_UDP_count')]): #if there are more variant alleles in the family/project than the population => variant allele is private to this family pass else: print 'inheritance_model is not recognized model given is',inheritance_model raise Exception else: print 'ERROR: Unexpected value=>hom gt value is',tl[htl.index('Single_VarAllele_UDP_count')] print 'number of affecteds in the family is',num_affected raise Exception #allele frequency penalties for Clinseq frequencies if 'CS_572c_homvar' in htl: if tl[htl.index('CS_572c_varallele')] == '-' or tl[htl.index('CS_572c_varallele')] == '0': pass elif int(tl[htl.index('CS_572c_varallele')]) > 0: if inheritance_model in ('autozomal-rec','x-linked-rec'): if int(tl[htl.index('CS_572c_homvar')]) == 0: #if there are no homozygous variants in the population penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')]) else: penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')]) * 10 elif inheritance_model in ('autozomal-dom','compound-heterozygous'): if int(tl[htl.index('CS_572c_homvar')]) == 0: #if there are no homozygous variants in the population penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')]) else: # if there are homozygous variant genotypes penaltyVarAlleleFrequency1 += float(tl[htl.index('CS_572c_varallele')]) * 10 else: print 'inheritance_model is not recognized model given is',inheritance_model raise Exception else: print 'ERROR: Unexpected value=> hom gt value is',tl[htl.index('CS_572c_varallele')] print 'number of affecteds in the family is',num_affected raise Exception else: pass penaltyscore = - (penaltyMAF + penaltyG5 + penaltyHomozygousVarGT + penaltyVarAlleleFrequency + penaltyVarAlleleFrequency1 + penalty_linkage) # if -5 > penaltyscore > -10: # print 'penalty score',penaltyscore # print 'single var_alellele_UDP count',tl[htl.index('Single_VarAllele_UDP_count')] # print 'myvarline varallele count',int(myvarline.var_allele_count) if penaltyscore > 0: print 'Positive penalty score found please check function' print 'penaltyMAF={0} penaltyG5={1} penaltyHomozygousVarGT={2} penaltyVarAlleleFrequency={3} penaltyVarAlleleFrequency1={4} penalty_linkage={5}'.format(penaltyMAF, penaltyG5, penaltyHomozygousVarGT, penaltyVarAlleleFrequency, penaltyVarAlleleFrequency1, penalty_linkage) print 'penalty score',penaltyscore if 'CS_572c_homvar' in htl: print 'CS_572c_varallele',tl[htl.index('CS_572c_varallele')] print 'single var_alellele_UDP count',tl[htl.index('Single_VarAllele_UDP_count')] print 'myvarline varallele count',int(myvarline.var_allele_count) raise Exception return (penaltyscore, penaltyMAF, penaltyG5, penaltyHomozygousVarGT, penaltyVarAlleleFrequency, penalty_linkage) def autosomal_recessive_filter(myaffected,mygts,tl1,htl,bed_coords,myfamcode): """ return 0 if not an autozomal recessive variant return 1 if an autozomal recessive variant """ test = False linepass = 1 hom_GTs=('A','G','C','T') het_GTs=('R','Y','K','M','S','W') inbed=0 for coord in bed_coords: try: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): inbed=1 except Exception: sys.stderr.write("""There was a problem with the bed coordinates or the file coordinates I was not able to convert the coordinates from the bed file and the coordinates from the input variant file into integers and evaluate the values for bed file annotation. The entries that caused the problem are: """) sys.stderr.write(str(coord)) sys.stderr.write('\n%s,%s,%s\n'%(tl1[htl.index('Chr')],tl1[htl.index('LeftFlank')],tl1[htl.index('RightFlank')])) raise Exception all_affecteds_are_hom_var=1 #all_unaffecteds_are_not_hom_var=1 hemizygous_line=0 mom_hemizygous=0 dad_hemizygous=0 hemizygous_sample='' for k,v in mygts.iteritems(): if test: print k,v if not ( (int(myaffected[k]['affected'])==1 and v[1]=="hom_var") or (int(myaffected[k]['affected'])==0 and v[1]!="hom_var") ): linepass=0 if test: print 'linepass= %s'%linepass if int(myaffected[k]['affected'])==1 and v[1]!='hom_var': all_affecteds_are_hom_var=0 #if int(myaffected[k]['affected'])==0 and v[1]=='hom_var': # all_unaffecteds_are_not_hom_var=0 if test: print 'all_affecteds_are_homvar =%s'%all_affecteds_are_hom_var #print 'all_affecteds_are_not_hom_var =%s'%all_unaffecteds_are_not_hom_var """ if linepass and all_affecteds_are_hom_var: if test: print 'linepassed and allaffecteds are hom var, checking parent hemizygosity' for k,v in mygts.iteritems(): if myaffected[k].get('mom') not in (None,"NA") and myaffected[k].get('dad') not in (None, "NA"): #print "k", k #print 'myaffected[k]',myaffected[k] #print 'myaffected[l].get(mom)', myaffected[k].get('mom') #print 'myaffected[k].get(dad)',myaffected[k].get('dad') if (int(myaffected[k]['affected'])==1 and v[1]=='hom_var' and ( (int(myaffected.get(myaffected[k].get('mom')).get('affected')) == 0 and mygts.get(myaffected[k].get('mom'),[0,0])[1] =='hom_ref') and (int(myaffected.get(myaffected[k].get('dad')).get('affected')) == 0 and mygts.get( myaffected[k].get('dad'),[0,0])[1] == 'het_complete') )): mom_hemizygous=1 if (int(myaffected[k]['affected'])==1 and v[1]=='hom_var' and ((int(myaffected.get(myaffected[k].get('dad')).get('affected')) == 0 and mygts.get(myaffected[k].get('dad'),[0,0])[1] == 'hom_ref') and (int(myaffected.get(myaffected[k].get('mom')).get('affected')) == 0 and mygts.get( myaffected[k].get('mom'),[0,0])[1]=='het_complete') )): dad_hemizygous=1 if mom_hemizygous or dad_hemizygous: hemizygous_line = 1 hemizygous_sample = k if hemizygous_line and inbed: pass # fastafile = pysam.Fastafile(mycfg.hg18fasta) # print '****************I got a real one*********************' # print 'line being evaluated is', tl1 # print 'hemizygous sample is %s '%hemizygous_sample # print 'mom_hemizygous=%s'%mom_hemizygous ,'dad_hemizygous=%s'%dad_hemizygous # interval=10000 # steps=20 # total_called_bases_count=0.0 # hom_GTs_count=0.0 # het_GTs_count=0.0 # other_GTs_count=0.0 #non hom non het gts B,D,H,V,N # called_bases=[] # histograms={} # for udpcode in (hemizygous_sample, myaffected[hemizygous_sample]['mom'],myaffected[hemizygous_sample]['dad']): # print udpcode # bamfile = mycfg.input_bam_files[myfamcode][udpcode] # print udpcode # print bamfile # samfile = pysam.Samfile( bamfile, "rb" ) # fetched= samfile.fetch('chr16',73310465,73310466) # for x in fetched: # print dir(x) # print 'x',str(x),'\n' # time.sleep(1) # # puped= samfile.pileup('chr16',73310465,73310466) # for y in puped: # if y.pos==73310466: # print 'pos is %s and coverage is %s, pileups= %s and tid=%s'%(y.pos,y.n,y.pileups, y.tid) # time.sleep(1) # pileup_iter = samfile.pileup (stepper = 'samtools' , fastafile=fastafile) # snpcaller= pysam.SNPCaller(pileup_iter) # sc= snpcaller.call(chrname,pos) # # histograms[udpcode]=[] # # chrname=tl1[htl.index('Chr')] # pos=int( tl1[htl.index('LeftFlank')]) # leftpos = pos-(interval*steps) # rightpos = leftpos + interval # for step in range(steps*2): # het_GTs_count1=0.0 # leftpos += interval # rightpos += interval # pileup_iter = samfile.pileup (chrname ,leftpos , rightpos, stepper = 'samtools' , fastafile=fastafile) # # snpcall_iter= pysam.IteratorSNPCalls(pileup_iter) # for sc in snpcall_iter: # total_called_bases_count+=1 # # print '\npos',sc.pos # # print 'reference_base',sc.reference_base # # print 'genotype',sc.genotype # # print 'coverage',sc.coverage # # print 'consensus_quality',sc.consensus_quality # # print 'mapping quality',sc.mapping_quality # # print 'snp_quality',sc.snp_quality # # print 'tid',sc.tid # if sc.genotype in hom_GTs: # hom_GTs_count+=1 # elif sc.genotype in het_GTs: # het_GTs_count+=1 # het_GTs_count1+=1 # else: # other_GTs_count+=1 # called_bases.append(sc.genotype) # #print 'for genomic region %s - %s '%(leftpos, rightpos) # #print '%s => Homozygous genotypes count= %s'%(udpcode,hom_GTs_count) # #print '%s => Heterozygous genotypes count= %s'%(udpcode,het_GTs_count) # #print 'other genotypes count= %s'%other_GTs_count # #print '%s => Total %s bases called'%(udpcode,total_called_bases_count) # #print called_bases # #print 'ratio of homozygote GT over tatal calls %s'%(hom_GTs_count/total_called_bases_count) # histograms[udpcode].append(het_GTs_count1) # for udpcode in (hemizygous_sample, myaffected[hemizygous_sample]['mom'],myaffected[hemizygous_sample]['dad']): # print 'histogram for %s'%udpcode # for count in histograms[udpcode]: # print '*'*int(count) else: pass #print 'I got a bogus one' """ #if linepass: # print 'returned linepass is',linepass # if all_affecteds_are_hom_var and all_unaffecteds_are_not_hom_var: # print 'all affecteds are hom var and all unaffecteds are not hom var',tl1 if test: print 'final linepass is %s \n'%linepass return linepass def somatic_mutation_filter(cdnegfile, project_code,**kwargs): """ mendelian_mendfilters hom res, dom, x-linked recessive detection function family.txt file must have the patient codes as UDPxxx.NA or UDPxxx and the format must match for patient code and affected scripts otherwise get affected script will error """ sys.stderr.write('Mendelian filtering started for %s at %s\n'%(cdnegfile, time.ctime())) start=time.time() test=0 mydb = db_class() mydb.create_tables() lenlines =0 myfile = open(cdnegfile) hline = myfile.readline() for x in myfile: lenlines+=1 fmm = open("%s/%smendelian_mismatches.txt"%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile)),"a") fmm.write(os.path.basename(cdnegfile)+'\n') fmm.write(hline) i=0.0 genes=open(mycfg.genes_to_exclude_path,"r").readlines() mydb=db_class() htl = hline.strip().split('\t') htl.insert(htl.index('dbsnp137'),'fam_mpg:cov') htl.insert(htl.index('dbsnp137'),'FIOFWSMIP') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_bed_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_linkage_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_cnv_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_score_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern bed_coords=[] cnv_bed_coords=[] linkage_bed_coords=[] if kwargs['include_bed_file'] != 'None': bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['include_bed_file']).readlines()] if kwargs['linkage_bed_file'] != 'None': linkage_bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['linkage_bed_file']).readlines()] if kwargs['cnv_bed_file'] != 'None': cnv_bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['cnv_bed_file']).readlines()] #print kwargs #print 'bed_ccords', len(bed_coords) #print 'linkage_bed_coords', len(linkage_bed_coords) #print 'starting calculations', len(cnv_bed_coords) mendfilters=['autozomal-rec','autozomal-dom','x-linked-rec'] #mendfilters=['autozomal-dom'] mendfiltered_list=[] nheader='\t'.join(htl)+'\n' mendfiltered_list.append(nheader) #header line preperation start finallines=[] #list to be sorted before writing to file myvarfile = varfile(htl) myvarfile.get_project_members(project_code) myaffected = myvarfile.project_members first_sample_index = myvarfile.first_sample_index linecounter=0 myfile.seek(0) myfile.readline() #for mendfilter in mendfilters: patient_ids, family_data = p_ids() print ' \t'+'\t'.join(htl) for line in myfile: linecounter += 1 #if linecounter == 5000: # break tl1=line.strip().split("\t") tl1.insert(htl.index('fam_mpg:cov'),'-') tl1.insert(htl.index('FIOFWSMIP'),'-') tl1.insert(htl.index('in_bed_regions'),'NA') tl1.insert(htl.index('in_linkage_regions'),'NA') tl1.insert(htl.index('in_cnv_regions'),'NA') tl1.insert(htl.index('varmd_autozomal_rec'),'') tl1.insert(htl.index('varmd_score_autozomal_rec'),'') tl1.insert(htl.index('varmd_autozomal_dom'),'-') tl1.insert(htl.index('varmd_score_autozomal_dom'),'') tl1.insert(htl.index('varmd_xlinked_rec'),'-') tl1.insert(htl.index('varmd_score_xlinked_rec'),'') if 'CDPred_score' in line: if tl1[htl.index('CDPred_score')] == '301': tl1[htl.index('CDPred_score')] = '0.00001' mygts = get_genotypes(tl1, htl , myaffected, first_sample_index, family_data) linepass_xlinked_rec = 1 all_affecteds_are_hom_var = 1 all_unaffecteds_are_not_hom_var = 1 linepass_autosomal_rec = autosomal_recessive_filter(myaffected, mygts, tl1,htl,cnv_bed_coords, project_code) assert linepass_autosomal_rec in [0,1],('linepass value is not expected') tl1[htl.index('varmd_autozomal_rec')] = str(linepass_autosomal_rec) print '%s\t'%linepass_autosomal_rec+'\t'.join(tl1) #autozomal dominant filter linepass_autosomal_dom = 1 if tl1[htl.index("Chr")] not in ["chrX","chrY","chrM"]: for k, v in mygts.iteritems(): if test: print '\n\n','k=',k ,'\n','v=',v print 'v[4]',v[4] print 'v[5]',v[5],'myaffected[k]',myaffected[k] time.sleep(1) if not( (int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or (int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ): linepass_autosomal_dom = 0 elif tl1[htl.index("Chr")] == "chrX": for k, v in mygts.iteritems(): #print "x linked dominant test?" #print '\n\n','k',k ,'\n','v',v #print 'family data[k]',family_data[k] #print 'v[4]',v[4] #print 'v[5]',v[5],'myaffected[k]',myaffected[k] #time.sleep(1) if not( (int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or (int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ): linepass_autosomal_dom = 0 elif tl1[htl.index("Chr")] in ["chrY","chrM"]: pass else: print tl1 print 'chr is not recognized: ',tl1[htl.index("Chr")] raise Exception if test: print 'linepass autosomal dom=',linepass_autosomal_dom tl1[htl.index('varmd_autozomal_dom')] = str(linepass_autosomal_dom) #x-linked reccessive filter for k, v in mygts.iteritems(): if ( tl1[htl.index('Chr')].lower()!='chrX'.lower() or (v[4]=='f' and not ( #(int(myaffected[k][4])==1 and (int(myaffected[k]['affected'])==1 and ( v[1]=="var_incomplete" or v[1]=="var_novel_het" )) or #(int(myaffected[k][4])==0 and (int(myaffected[k]['affected'])==0 and ( v[1]=="hom_ref" or v[1]=="het_complete" or v[1]=="ref_incomplete" or v[1]=="ref_novel_het" )) )) or #if a male is affected his genotype should have a single allele of variant #if he is not affected his genotype should be either hom_ref of ref_incomplete #with no variation at all present #if these criteri are not met than the line fails to pass the filter #if male and not ( affected and gt==var_incomplete) or (not affected and gt==either hom_ref or ref_incomplete) #line doesn't pass filter (v[4]=='m' and #not( (int(myaffected[k][4])==1 and not( (int(myaffected[k]['affected'])==1 and ( #v[1]=="het_complete" or v[1]=="var_incomplete" ) ) or #(int(myaffected[k][4])==0 and (int(myaffected[k]['affected'])==0 and ( v[1]=="hom_ref" or v[1]=="ref_incomplete" ) ))) ): linepass_xlinked_rec = 0 tl1[htl.index('varmd_xlinked_rec')] = str(linepass_xlinked_rec) #if all_affecteds_are_hom_var and all_unaffecteds_are_not_hom_var: # print tl1 #print 'linepass',linepass #if linepass==1: # print 'line passed' #elif linepass==0: # print 'line did not pass' genepass = 1 file_gene = '' if 'refseq' in hline: file_gene = tl1[htl.index('refseq')].lower() elif 'ss_geneList' in hline and (not 'refseq' in hline): file_gene = tl1[htl.index('ss_geneList')].lower() for item in genes: gene=item.strip() delim=';' if delim in item: gene=item.split(delim)[0] #print 'gene from line',tl1[htl.index('refseq')] #print 'gene from list',gene #time.sleep(1) if file_gene == gene.lower(): #print 'found gene' #print 'tl1 gene',tl1[htl.index('refseq')].lower() #print gene.lower() #time.sleep(1) genepass=0 else: #print 'not found gene' #print 'tl1 gene',tl1[htl.index('refseq')].lower() #print gene.lower() pass my_mpg_score = mpg_scores(tl1 , htl) #HWE_wrong=check_HWE(line,cdnegfile) #if 'PRF1' in line: if test: print tl1 print 'linepass_autosomal_rec',linepass_autosomal_rec print 'linepass_autosomal_dom',linepass_autosomal_dom print 'linepass_xlinked_rec',linepass_xlinked_rec print 'genepass',genepass print 'my_mpg_score %s \n'%str(my_mpg_score) time.sleep(1) if linepass_autosomal_rec == 0: pass #print '\nline that did not pass autozomal recessive filter\n',tl1 #time.sleep(1) if ( (linepass_autosomal_rec or linepass_autosomal_dom or linepass_xlinked_rec) and genepass and my_mpg_score>0.5 and 'random' not in tl1[htl.index('Chr')].lower() ): # if line meets the Mendelian criteria and the gene is not an unimportant gene and mpg/coverage is good (>0.4) in at least half of the individuals #print "hom res match found\n",tl1,"\n" if kwargs['include_bed_file']!='None': tl1[htl.index('in_bed_regions')]='-' for coord in bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_bed_regions')]='+' if kwargs['linkage_bed_file']!='None': tl1[htl.index('in_linkage_regions')]='-' for coord in linkage_bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_linkage_regions')]='+' if kwargs['cnv_bed_file']!='None': tl1[htl.index('in_cnv_regions')]='-' for coord in cnv_bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_cnv_regions')]='+' #it is important to keep the linkage_region annotation before the penalty score calculation #as this information is used in the penalty score calculations penaltyscore_autozomal_rec = penalty_score(tl1,htl,cdnegfile,"autozomal-rec", project_code) penaltyscore_autozomal_dom = penalty_score(tl1,htl,cdnegfile,"autozomal-dom", project_code) penaltyscore_xlinked_rec = penalty_score(tl1,htl,cdnegfile,"x-linked-rec", project_code) #tl1[htl.index('varscore')]=str(float(tl1[htl.index('varscore')])-penaltyscore) tl1[htl.index('varmd_score_autozomal_rec')] = str(penaltyscore_autozomal_rec[0]) tl1[htl.index('varmd_score_autozomal_dom')] = str(penaltyscore_autozomal_dom[0]) tl1[htl.index('varmd_score_xlinked_rec')] = str(penaltyscore_xlinked_rec[0]) tl1[htl.index('varscore')] = str(max( penaltyscore_autozomal_rec[0], penaltyscore_autozomal_dom[0], penaltyscore_xlinked_rec[0]) ) #debug remove tl1[htl.index('fam_mpg:cov')] = str(my_mpg_score) #add mpg:coverage score to the file finallines.append(tl1) #ATTENTION #database driven found in other families with same mendelian model penalty, too slow, #the database function works fine but it is slow, try to improve speed and reenable it if 'CDPred_score' in hline and tl1[htl.index('CDPred_score')]=='-': tl1[htl.index('CDPred_score')] = '0.001' #not zero to differentiate from real zero values but almost zero in value # time.sleep(5) #elif resultdict['foundsameflag'] == True: # print 'found same variant in same family; previously entered varint row passing' # print resultdict['foundsamerow'] if linecounter % 50000 == 0: sys.stderr.write('line %s of %s is processed\n'%(linecounter,lenlines)) #determines mismatches between my algorithm and nancies if 'MendHomRec' in hline: if int(tl1[htl.index('MendHomRec')]) == 1 and linepass_autosomal_rec == 0: #print 'tl1',tl1 fmm.write('Autozomal recessive mismatch:\n') fmm.write("\t".join(tl1)+'\n') if 'MendDom' in hline: if int(tl1[htl.index('MendDom')]) == 1 and linepass_autosomal_dom == 0: #print 'tl1',tl1 fmm.write('Autozomal dominant mismatch:\n') fmm.write("\t".join(tl1)+'\n') if 'CDPred_score' in hline: s_finallines = sorted(finallines, key = lambda line:(float(line[htl.index('varscore')]),abs(float(line[htl.index('CDPred_score')])) ), reverse=True) else: s_finallines = sorted(finallines, key=lambda line:(float(line[htl.index('varscore')])), reverse=True) for tabline in s_finallines: mendfiltered_list.append("\t".join(tabline)+"\n") file_path = args.output_file #close_variants(mendfiltered_list) #raise Exception write_lines(file_path, mendfiltered_list) print 'Mendelian analysis output:{}'.format(file_path) fstats=open("%s/analysis_stats.txt"%os.path.dirname(cdnegfile),"a")# write file stats fstats.write("The number of variant lines that passed (autozomal_rec, autozomal_dom, xlinked_rec are:\t{} \n".format( len(mendfiltered_list)-1, )) fstats.close() end=time.time() print "Elapsed time = {} min".format((end-start)/60 ) fmm.close() def simple_mendelian_filters(cdnegfile, project_code,**kwargs): """ mendelian_mendfilters hom res, dom, x-linked recessive detection function family.txt file must have the patient codes as UDPxxx.NA or UDPxxx and the format must match for patient code and affected scripts otherwise get affected script will error This function annotates the file with coordinates from upto three bed files. The coordinates in the bed files must be hg19 based. """ sys.stderr.write('Mendelian filtering started for %s at %s\n'%(cdnegfile, time.ctime())) start=time.time() test=0 mydb = db_class() mydb.create_tables() lenlines =0 myfile = open(cdnegfile) hline = myfile.readline() for x in myfile: lenlines+=1 fmm = open("%s/%smendelian_mismatches.txt"%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile)),"a") fmm.write(os.path.basename(cdnegfile)+'\n') fmm.write(hline) i=0.0 genes=open(mycfg.genes_to_exclude_path,"r").readlines() mydb=db_class() htl = hline.strip().split('\t') htl.insert(htl.index('dbsnp137'),'fam_mpg:cov') htl.insert(htl.index('dbsnp137'),'FIOFWSMIP') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_bed_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_linkage_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_cnv_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_rec') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_score_autozomal_dom') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'varmd_score_xlinked_rec') #Found In Other Family With Same Mendelian Inheritance Pattern bed_coords=[] cnv_bed_coords=[] linkage_bed_coords=[] if kwargs['include_bed_file'] != 'None': bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['include_bed_file']).readlines()] if kwargs['linkage_bed_file'] != 'None': linkage_bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['linkage_bed_file']).readlines()] if kwargs['cnv_bed_file'] != 'None': cnv_bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['cnv_bed_file']).readlines()] #print kwargs #print 'bed_ccords', len(bed_coords) #print 'linkage_bed_coords', len(linkage_bed_coords) #print 'starting calculations', len(cnv_bed_coords) mendfilters=['autozomal-rec','autozomal-dom','x-linked-rec'] #mendfilters=['autozomal-dom'] mendfiltered_list=[] nheader='\t'.join(htl)+'\n' mendfiltered_list.append(nheader) #header line preperation start finallines=[] #list to be sorted before writing to file myvarfile = varfile(htl) myvarfile.get_project_members(project_code) myaffected = myvarfile.project_members first_sample_index = myvarfile.first_sample_index linecounter=0 myfile.seek(0) myfile.readline() #for mendfilter in mendfilters: patient_ids, family_data = p_ids() #print ' \t'+'\t'.join(htl) gt_qual_scores ={} for line in myfile: linecounter += 1 #if linecounter == 5000: # break tl1=line.strip().split("\t") tl1.insert(htl.index('fam_mpg:cov'),'-') tl1.insert(htl.index('FIOFWSMIP'),'-') tl1.insert(htl.index('in_bed_regions'),'NA') tl1.insert(htl.index('in_linkage_regions'),'NA') tl1.insert(htl.index('in_cnv_regions'),'NA') tl1.insert(htl.index('varmd_autozomal_rec'),'') tl1.insert(htl.index('varmd_score_autozomal_rec'),'') tl1.insert(htl.index('varmd_autozomal_dom'),'-') tl1.insert(htl.index('varmd_score_autozomal_dom'),'') tl1.insert(htl.index('varmd_xlinked_rec'),'-') tl1.insert(htl.index('varmd_score_xlinked_rec'),'') if 'CDPred_score' in line: if tl1[htl.index('CDPred_score')] == '301': tl1[htl.index('CDPred_score')] = '0.00001' mygts = get_genotypes(tl1, htl , myaffected, first_sample_index, family_data) linepass_xlinked_rec = 1 all_affecteds_are_hom_var = 1 all_unaffecteds_are_not_hom_var = 1 linepass_autosomal_rec = autosomal_recessive_filter(myaffected, mygts, tl1,htl,cnv_bed_coords, project_code) assert linepass_autosomal_rec in [0,1],('linepass value is not expected') tl1[htl.index('varmd_autozomal_rec')] = str(linepass_autosomal_rec) #autozomal dominant filter linepass_autosomal_dom = 1 if tl1[htl.index("Chr")] not in ["chrX","chrY","chrM"]: for k, v in mygts.iteritems(): if test: print '\n\n','k=',k ,'\n','v=',v print 'v[4]',v[4] print 'v[5]',v[5],'myaffected[k]',myaffected[k] time.sleep(1) if not( (int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or (int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ): linepass_autosomal_dom = 0 elif tl1[htl.index("Chr")] == "chrX": for k, v in mygts.iteritems(): #print "x linked dominant test?" #print '\n\n','k',k ,'\n','v',v #print 'family data[k]',family_data[k] #print 'v[4]',v[4] #print 'v[5]',v[5],'myaffected[k]',myaffected[k] #time.sleep(1) if not( (int(myaffected[k]['affected'])==1 and (v[1]=="het_complete" or v[1]=="var_incomplete")) or (int(myaffected[k]['affected'])==0 and (v[1]=="hom_ref" or v[1]=="ref_incomplete")) ): linepass_autosomal_dom = 0 elif tl1[htl.index("Chr")] in ["chrY","chrM"]: pass else: print tl1 print 'chr is not recognized: ',tl1[htl.index("Chr")] raise Exception if test: print 'linepass autosomal dom=',linepass_autosomal_dom tl1[htl.index('varmd_autozomal_dom')] = str(linepass_autosomal_dom) #x-linked reccessive filter for k, v in mygts.iteritems(): if ( tl1[htl.index('Chr')].lower()!='chrX'.lower() or (v[4]=='f' and not ( #(int(myaffected[k][4])==1 and (int(myaffected[k]['affected'])==1 and ( v[1]=="var_incomplete" or v[1]=="var_novel_het" )) or #(int(myaffected[k][4])==0 and (int(myaffected[k]['affected'])==0 and ( v[1]=="hom_ref" or v[1]=="het_complete" or v[1]=="ref_incomplete" or v[1]=="ref_novel_het" )) )) or #if a male is affected his genotype should have a single allele of variant #if he is not affected his genotype should be either hom_ref of ref_incomplete #with no variation at all present #if these criteri are not met than the line fails to pass the filter #if male and not ( affected and gt==var_incomplete) or (not affected and gt==either hom_ref or ref_incomplete) #line doesn't pass filter (v[4]=='m' and #not( (int(myaffected[k][4])==1 and not( (int(myaffected[k]['affected'])==1 and ( #v[1]=="het_complete" or v[1]=="var_incomplete" ) ) or #(int(myaffected[k][4])==0 and (int(myaffected[k]['affected'])==0 and ( v[1]=="hom_ref" or v[1]=="ref_incomplete" ) ))) ): linepass_xlinked_rec = 0 tl1[htl.index('varmd_xlinked_rec')] = str(linepass_xlinked_rec) #if all_affecteds_are_hom_var and all_unaffecteds_are_not_hom_var: # print tl1 #print 'linepass',linepass #if linepass==1: # print 'line passed' #elif linepass==0: # print 'line did not pass' genepass = 1 file_gene = '' if 'refseq' in hline: file_gene = tl1[htl.index('refseq')].lower() elif 'ss_geneList' in hline and (not 'refseq' in hline): file_gene = tl1[htl.index('ss_geneList')].lower() for item in genes: gene=item.strip() delim=';' if delim in item: gene=item.split(delim)[0] #print 'gene from line',tl1[htl.index('refseq')] #print 'gene from list',gene #time.sleep(1) if file_gene == gene.lower(): #print 'found gene' #print 'tl1 gene',tl1[htl.index('refseq')].lower() #print gene.lower() #time.sleep(1) genepass=0 else: #print 'not found gene' #print 'tl1 gene',tl1[htl.index('refseq')].lower() #print gene.lower() pass my_mpg_score = mpg_scores(tl1 , htl) if not gt_qual_scores.get(my_mpg_score): gt_qual_scores[my_mpg_score] = 1 else: gt_qual_scores[my_mpg_score] += 1 #HWE_wrong=check_HWE(line,cdnegfile) #if 'PRF1' in line: if test and (linepass_autosomal_rec or linepass_autosomal_dom or linepass_xlinked_rec): print tl1 print 'linepass_autosomal_rec',linepass_autosomal_rec print 'linepass_autosomal_dom',linepass_autosomal_dom print 'linepass_xlinked_rec',linepass_xlinked_rec print 'genepass',genepass print 'my_mpg_score %s \n'%str(my_mpg_score) time.sleep(1) if ( (linepass_autosomal_rec or linepass_autosomal_dom or linepass_xlinked_rec) and genepass and my_mpg_score > 0.5 and 'random' not in tl1[htl.index('Chr')].lower() ): # if line meets the Mendelian criteria and the gene is not an unimportant gene and mpg/coverage is good (>0.4) in at least half of the individuals #print "hom res match found\n",tl1,"\n" if kwargs['include_bed_file']!='None': tl1[htl.index('in_bed_regions')]= '-' for coord in bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('hg19LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('hg19RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_bed_regions')]='+' if kwargs['linkage_bed_file']!='None': tl1[htl.index('in_linkage_regions')]='-' for coord in linkage_bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('hg19LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('hg19RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_linkage_regions')]='+' if kwargs['cnv_bed_file']!='None': tl1[htl.index('in_cnv_regions')]='-' for coord in cnv_bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('hg19LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('hg19RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_cnv_regions')]='+' #it is important to keep the linkage_region annotation before the penalty score calculation #as this information is used in the penalty score calculations penaltyscore_autozomal_rec = penalty_score(tl1,htl,cdnegfile,"autozomal-rec", project_code) penaltyscore_autozomal_dom = penalty_score(tl1,htl,cdnegfile,"autozomal-dom", project_code) penaltyscore_xlinked_rec = penalty_score(tl1,htl,cdnegfile,"x-linked-rec", project_code) #tl1[htl.index('varscore')]=str(float(tl1[htl.index('varscore')])-penaltyscore) tl1[htl.index('varmd_score_autozomal_rec')] = str(penaltyscore_autozomal_rec[0]) tl1[htl.index('varmd_score_autozomal_dom')] = str(penaltyscore_autozomal_dom[0]) tl1[htl.index('varmd_score_xlinked_rec')] = str(penaltyscore_xlinked_rec[0]) tl1[htl.index('varscore')] = str(max( penaltyscore_autozomal_rec[0], penaltyscore_autozomal_dom[0], penaltyscore_xlinked_rec[0]) ) #debug remove tl1[htl.index('fam_mpg:cov')] = str(my_mpg_score) #add mpg:coverage score to the file finallines.append(tl1) #ATTENTION #database driven found in other families with same mendelian model penalty, too slow, #the database function works fine but it is slow, try to improve speed and reenable it # fiof_penalty=0 # if linepass_autosomal_rec: # resultdict = mydb.check_and_add_db(headerline = htl, # variantline1 = tl1, # project_code = project_code, # mendtype = "autozomal_rec" # ) # if linepass_autosomal_dom: # resultdict = mydb.check_and_add_db(headerline = htl, # variantline1 = tl1, # project_code = project_code, # mendtype = "autozomal_dom" # ) # if linepass_xlinked_rec: # resultdict = mydb.check_and_add_db(headerline = htl, # variantline1 = tl1, # project_code = project_code, # mendtype = "xlinked_rec" # ) # # if resultdict['founddifferentflag'] == True: # #print 'found same variant in different family==>',resultdict['founddifferentrow'] # #print 'line in this family is',tl1 # for item in resultdict['founddifferentrow']: # tl1[htl.index('FIOFWSMIP')]+='%s,'%item[1] # fiof_penalty -= 100 # tl1[htl.index('varscore')] = str((float(tl1[htl.index('varscore')])+ fiof_penalty)) if 'CDPred_score' in hline and tl1[htl.index('CDPred_score')]=='-': tl1[htl.index('CDPred_score')] = '0.001' #not zero to differentiate from real zero values but almost zero in value # time.sleep(5) #elif resultdict['foundsameflag'] == True: # print 'found same variant in same family; previously entered varint row passing' # print resultdict['foundsamerow'] if linecounter % 50000 == 0: sys.stderr.write('line %s of %s is processed\n'%(linecounter,lenlines)) #determines mismatches between my algorithm and nancies if 'MendHomRec' in hline: if int(tl1[htl.index('MendHomRec')]) == 1 and linepass_autosomal_rec == 0: #print 'tl1',tl1 fmm.write('Autozomal recessive mismatch:\n') fmm.write("\t".join(tl1)+'\n') if 'MendDom' in hline: if int(tl1[htl.index('MendDom')]) == 1 and linepass_autosomal_dom == 0: #print 'tl1',tl1 fmm.write('Autozomal dominant mismatch:\n') fmm.write("\t".join(tl1)+'\n') for k,v in gt_qual_scores.iteritems(): print k,v print 'len(finallines)',len(finallines) print linecounter if 'CDPred_score' in hline: s_finallines = sorted(finallines, key = lambda line:(float(line[htl.index('varscore')]),abs(float(line[htl.index('CDPred_score')])) ), reverse=True) else: s_finallines = sorted(finallines, key=lambda line:(float(line[htl.index('varscore')])), reverse=True) for tabline in s_finallines: mendfiltered_list.append("\t".join(tabline)+"\n") file_path = args.output_file #close_variants(mendfiltered_list) #raise Exception write_lines(file_path, mendfiltered_list) print 'Mendelian analysis output:{}'.format(file_path) fstats=open("%s/analysis_stats.txt"%os.path.dirname(cdnegfile),"a")# write file stats fstats.write("The number of variant lines that passed (autozomal_rec, autozomal_dom, xlinked_rec are:\t{} \n".format( len(mendfiltered_list)-1, )) fstats.close() end=time.time() print "Elapsed time = {} min".format((end-start)/60 ) fmm.close() def mendelian_filter_com_het_res(cdnegfile, outputfile, project_code,**kwargs): """ this function compares two lines and finds compund heterozygotes if genotypes are similar but affected status don't match it should eliminate these false positive results such as AG AG AG AG although G is variant since it is found in unaffecteds in geterozygote state it should be eliminated from the result set """ test=0 sys.stderr.write('Compound heterozygous mendelian filtering started for %s at %s\n'%(cdnegfile, time.ctime())) lineslist = open(cdnegfile).readlines() start=time.time() i=0.0 mydb = db_class() mydb.create_tables() htl=lineslist[0].strip().split("\t") myvarfile = varfile(htl) com_het_res=[] htl.insert(htl.index('dbsnp137'),'fam_mpg:cov') htl.insert(htl.index('dbsnp137'),'FIOFWSMIP') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_bed_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_linkage_regions') #Found In Other Family With Same Mendelian Inheritance Pattern htl.insert(htl.index('dbsnp137'),'in_cnv_regions') #Found In Other Family With Same Mendelian Inheritance Pattern nheader='\t'.join(htl)+'\n' lineslist1=deepcopy(lineslist[1:]) lineslist2=deepcopy(lineslist[1:]) numlines = len(lineslist1) myvarfile.get_project_members(project_code) myaffected = myvarfile.project_members #for k,v in myaffected.iteritems(): # print "key = %s , value=%s"%( k,v) com_het_res.append(nheader) #add the headerline to the file as the first line genes_toexclude = open(mycfg.genes_to_exclude_path,"r").readlines() controllines={} #controllines dictionary has each gene as key and all variation lines for that gene as values for line1 in lineslist1: tl1=line1.strip().split("\t") tl1.insert(htl.index('fam_mpg:cov'),'-') tl1.insert(htl.index('FIOFWSMIP'),'-') #Found In Other Family With Same Mendelian Inheritance Pattern tl1.insert(htl.index('in_bed_regions'),'NA') tl1.insert(htl.index('in_linkage_regions'),'NA') tl1.insert(htl.index('in_cnv_regions'),'NA') file_gene='' if 'refseq' in lineslist[0]: file_gene = tl1[htl.index('refseq')].lower() elif not ('refseq' in lineslist[0]) and ('Gene_name' in lineslist[0]): file_gene = tl1[htl.index('Gene_name')].lower() elif not ('refseq' in lineslist[0]) and not ('Gene_name' in lineslist[0]) and 'ss_geneList' in lineslist[0]: file_gene = tl1[htl.index('ss_geneList')].lower() else: raise Exception if not file_gene in controllines: controllines[file_gene] = [line1] elif file_gene in controllines: controllines[file_gene].append(line1) #print '\n\n controller:',controllines #time.sleep(1) bed_coords=[] cnv_bed_coords=[] linkage_bed_coords=[] if kwargs['include_bed_file'] != 'None': bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['include_bed_file']).readlines()] if kwargs['linkage_bed_file'] != 'None': linkage_bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['linkage_bed_file']).readlines()] if kwargs['cnv_bed_file'] != 'None': cnv_bed_coords=[[x.strip('\r\n').split('\t')[0],\ x.strip('\r\n').split('\t')[1],\ x.strip('\r\n').split('\t')[2]]\ for x in open(kwargs['cnv_bed_file']).readlines()] linestosort=[] first_sample_index = myvarfile.first_sample_index patient_ids, family_data = p_ids() if myvarfile.total_num_samples == 1: sys.stderr.write('Only a single subjects variants are found, skipping compound het analysis\n') sys.stderr.write('Please re-run compound het analysis is more members\' data are added to the file\n') else: for line1 in lineslist1: i+=1 pt_d1={} tl1=line1.strip().split("\t") tl1.insert(htl.index('fam_mpg:cov'),'-') tl1.insert(htl.index('FIOFWSMIP'),'-') #Found In Other Family With Same Mendelian Inheritance Pattern tl1.insert(htl.index('in_bed_regions'),'NA') tl1.insert(htl.index('in_linkage_regions'),'NA') tl1.insert(htl.index('in_cnv_regions'),'NA') file_gene = tl1[htl.index('Gene_name')].lower() if file_gene in ['NA'.lower(),'none','-']: #this prevents 'NA' intergenic variants from pairing with arbitrary other continue gts1 = get_genotypes(tl1 , htl , myaffected, first_sample_index, family_data) #pos 0 :pass1, pos1: pass 2, pos2: affected status, pos3: first #line pt_ raw genotype, Pos4:line 1 pt calculated genotype, #pos5:second line pt_raw genotype, pos6: second line pt calculated genotype if test: print '\ntesting gts dictionary output in com_het_res mendfilter' print 'gts1',gts1 time.sleep(0.2) #if genename==tl1[htl.index('refseq')]: # print 'index',tl1[0] # print 'in com het res function ','k=',k,'v=',v for k,v in gts1.items(): #print k,v pt_d1[k]=[int(myaffected[k]['affected']),v[0],v[1]] #v[0]= genotype v[1]= genotype class v[2]=mpg_score v[3]=coverage v[4]=gender pt_d1[k].append('index=%s'%tl1[htl.index('Index')]) #print pt_d1[k] #raise Exception #if genename==tl1[htl.index('refseq')]:print '\n' linepass1=1 for k,v in pt_d1.items(): if not (((v[0]==1 and (v[2]=="het_complete" or v[2]=="var_incomplete"))) or ( v[0]==0 and (v[2]=="hom_ref" or v[2]=="ref_incomplete" or v[2]=="het_complete" ))): linepass1 = 0 genepass=1 for item in genes_toexclude: gene=item.strip() delim=';' if delim in item: gene = item.split(delim)[0] #print 'gene is',gene if file_gene == gene.lower(): #if genename==tl1[htl.index('refseq')]: # print "it's a bogus gene therefore it is eliminated from the results" genepass=0 my_mpg_score = mpg_scores(tl1, htl) tl1[htl.index('fam_mpg:cov')] = str(my_mpg_score) #add mpg:coverage score to the line if test: print 'mpg_score is',my_mpg_score #if genename==tl1[htl.index('refseq')]: # print 'mpg_score is',mpg_score if ( linepass1 and genepass and my_mpg_score>0.5 and ('random' not in tl1[htl.index('Chr')].lower()) ): # if line meets the Mendelian criteria and the gene is not an #unimportant gene and mpg/coverage is good (>0.4) in at least half of the individuals and for line2 in controllines[file_gene]: pt_d2={} tl2=line2.strip().split("\t") tl2.insert(htl.index('fam_mpg:cov'),'-') tl2.insert(htl.index('FIOFWSMIP'),'-') #Found In Other Family With Same Mendelian Inheritance Pattern tl2.insert(htl.index('in_bed_regions'),'NA') tl2.insert(htl.index('in_linkage_regions'),'NA') tl2.insert(htl.index('in_cnv_regions'),'NA') mpg_score2 = mpg_scores(tl2, htl) #returns a float mpg_score tl2[htl.index('fam_mpg:cov')] = str(mpg_score2) #add mpg:coverage score to the line gts2 = get_genotypes(tl2, htl, myaffected, first_sample_index, family_data) if test: print 'second gts',gts2 linepass2 = 1 #reset the pass status for second line for k,v in gts2.items(): #pt_d2[k]=[int(myaffected[k][4]),v[0],v[1]]#v[0]= v[1]= v[2]=affected status v[4]=genotype v[5]=genotype class pt_d2[k]=[int(myaffected[k]['affected']),v[0],v[1]]#v[0]= v[1]= v[2]=affected status v[4]=genotype v[5]=genotype class pt_d2[k].append('index=%s'%tl2[htl.index('Index')]) if test: print 'gts1',gts1 print 'gts2',gts2 print 'pt_d2[k]',pt_d2.items() print 'index 1',tl1[htl.index('Index')] print 'index 2',tl2[htl.index('Index')] for k,v in pt_d2.items(): if not (( (v[0]==1 and (v[2]=="het_complete" or v[2]=="var_incomplete")) ) or (v[0]==0 and (v[2]=="hom_ref" or v[2]=="ref_incomplete" or v[2]=="het_complete" )) ): #these are to check if patients and unaffecteds genotypes are not matching exactly linepass2=0 else: pass #pt_d1[k].append(pt_d2[k]) #if genename==tl1[htl.index('refseq')]: # print 'index of second line based on gene name before pass',tl2[0] if (tl1!=tl2 and mpg_score2>0.5 and linepass2): #line meets the Mendelian criteria and the gene is not an unimportant gene and #mpg/coverage is good (>0.4) in at least half of the individuals linespass=1 #print 'all passed, before final check pt_d1=',pt_d1.items() #print 'all passed, before final check pt_d2=',pt_d2.items() #time.sleep(3) #print 'initial linespassed' #if genename==tl1[htl.index('refseq')]: # print 'index of second line based on gene name after pass',tl2[0] for k1,v1 in pt_d1.items(): for k2,v2 in pt_d2.items(): if k1!=k2 and v1[0]!=v2[0]:#for different pweople whose affected status are different as well if v1[1]==v2[1]:# their genotypes match linespass=0 elif (pt_d1[k1][1],pt_d2[k1][1]) == (pt_d1[k2][1],pt_d2[k2][1]): linespass=0 """ print '\nboth lines passed , not same lines, same gene, same transcript, good mpg scores' print 'line1passed linepass1=%s genepass=%s mpg_score1=%s'%(linepass1,genepass,mpg_score) print 'line2passed linepass2=%s mpg_score2=%s'%(linepass1,mpg_score2) time.sleep(1) """ if linespass: duplicateflag=0 #print '\nall passed, after final check, before write pt_d1=',pt_d1.items() #print 'all passed, after final check, before write pt_d2=',pt_d2.items() #time.sleep(1) #if genename==tl1[htl.index('refseq')]: # print "\t".join(tl1)+"\n"+"\t".join(tl2)+"\n\n" # print tl1 # print tl2 # time.sleep(1) if len(linestosort)>=1: for x in linestosort: #print 'x and linestosort index[x]',x,linestosort[x] #print 'tl1=>',tl1 #print 'tl2=>',tl2 #print '\n' #time.sleep(1) if (x[2][htl.index('Index')] , x[3][htl.index('Index')]) == (tl2[htl.index('Index')],tl1[htl.index('Index')]): #print '\n\nfound duplicate' #print (linestosort[x][2][htl.index('Index')] , linestosort[x][3][htl.index('Index')]), (tl2[htl.index('Index')],tl1[htl.index('Index')]) #time.sleep(2) duplicateflag=1 if not duplicateflag: if kwargs['include_bed_file']!='None': tl1[htl.index('in_bed_regions')]='-' tl2[htl.index('in_bed_regions')]='-' for coord in bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_bed_regions')]='+' if (tl2[htl.index('Chr')]==coord[0] and int(tl2[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl2[htl.index('RightFlank')]) <= int(coord[2]) ): tl2[htl.index('in_bed_regions')]='+' if kwargs['linkage_bed_file']!='None': tl1[htl.index('in_linkage_regions')]='-' tl2[htl.index('in_linkage_regions')]='-' for coord in linkage_bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_linkage_regions')]='+' if (tl2[htl.index('Chr')]==coord[0] and int(tl2[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl2[htl.index('RightFlank')]) <= int(coord[2]) ): tl2[htl.index('in_linkage_regions')]='+' if kwargs['cnv_bed_file']!='None': tl1[htl.index('in_cnv_regions')]='-' tl2[htl.index('in_cnv_regions')]='-' for coord in cnv_bed_coords: if (tl1[htl.index('Chr')]==coord[0] and int(tl1[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl1[htl.index('RightFlank')]) <= int(coord[2]) ): tl1[htl.index('in_cnv_regions')]='+' if (tl2[htl.index('Chr')]==coord[0] and int(tl2[htl.index('LeftFlank')]) >= int(coord[1]) and int(tl2[htl.index('RightFlank')]) <= int(coord[2]) ): tl2[htl.index('in_cnv_regions')]='+' penaltyscore1 = penalty_score(tl1,htl,cdnegfile,'compound-heterozygous', project_code) tl1[htl.index('varscore')] = str(penaltyscore1[0]) penaltyscore2 = penalty_score(tl2,htl,cdnegfile,'compound-heterozygous', project_code) tl2[htl.index('varscore')]= str(penaltyscore2[0]) if 'CDPred_score' in htl: if tl1[htl.index('CDPred_score')]=='-': tl1[htl.index('CDPred_score')]= '301' if tl2[htl.index('CDPred_score')]=='-': tl2[htl.index('CDPred_score')]= '301' linestosort.append( [(float(tl1[htl.index('varscore')]) + float(tl2[htl.index('varscore')])), min( int(tl1[htl.index('CDPred_score')]) , int(tl2[htl.index('CDPred_score')]) ), tl1, tl2]) elif not ('CDPred_score' in htl) and args.nextgene_data: linestosort.append( [(float(tl1[htl.index('varscore')]) + float(tl2[htl.index('varscore')])), '301', tl1, tl2]) else: raise Exception linespass=0 if i % 1000 == 0: sys.stderr.write("found a pair: progress in file %s is %.5s %%\n"%(cdnegfile , i/numlines *100)) sys.stderr.write("elapsed time = %.5s min\n"%((time.time() - start)/60)) #print 'linestosort',linestosort #linespass=0 #reset linepass # for kayit in linestosort: # resultdict = mydb.check_and_add_db(htl=htl, # variantline1 = kayit[2], # variantline2 = kayit[3], # project_code = project_code, # mendtype = 'compound_heterozygote' # ) # fiof_penalty=0 # if resultdict['founddifferentflag'] == True: # #print 'found same variant pair in different family==>',resultdict['founddifferentrow'] # #print 'lines in this family are line1=',tl1 # #print 'lines in this family are line2=',tl2 # for result in resultdict['founddifferentrow']: # kayit[2][htl.index('FIOFWSMIP')]+='%s,'%result[1] # kayit[3][htl.index('FIOFWSMIP')]+='%s,'%result[1] # fiof_penalty-=1000 # kayit[0]+=fiof_penalty sorted_lines = sorted(linestosort, key=lambda item:float(item[0]), reverse=True) for item in sorted_lines: item[2][htl.index('varscore')] = str(item[0]) item[3][htl.index('varscore')] = str(item[0]) com_het_res.append("\t".join(item[2])+"\n") com_het_res.append("\t".join(item[3])+"\n") com_het_res_path='%s/%s.com_het_res'%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile)) write_lines(com_het_res_path,com_het_res) print 'Mendelian compound het analysis output:{}'.format(com_het_res_path) # write file stats fstats=open("%s/analysis_stats.txt"%os.path.dirname(cdnegfile),"a") fstats.write("The number of pairs for compound heterozygous variants are:\t%s "%((len(com_het_res)-1)/2) +"\n" ) end=time.time() print "total function elapsed time=",end-start #removes the temp directory created during execution tmpdir='%s/%s_tmp'%(os.path.dirname(cdnegfile),os.path.basename(cdnegfile)) if os.path.exists(tmpdir): print 'don\'t forget to delete the directory by enabling the rmtree function' #shutil.rmtree(tmpdir) def annotate_seattle_seq_snv(inputfile, outputfile): """ SeattleSEQ ANNOTATIONS --- inDBSNPOrNot (whether the variation is in the dbSNP database and-or 1000 Genomes; either echoing the user input in the case of a Maq file, or with no user input, whether represented in these datasets) --- chromosome (input from the user) --- position (input from user, location on the chromosome, hg19, 1-based) --- referenceBase (input from the user or calculated if not provided) --- sampleGenotype (input from the user) --- accession (NCBI or CCDS transcript identifier) --- functionGVS (GVS class of variation function, using only hg19 and your submitted alleles; see description) --- functionDBSNP (dbSNP class of variation function) --- rsID (dbSNP identifier for the variation) --- aminoAcids (list of amino acids for the codon, starting with that of the reference base) --- proteinPosition (the position of the amino acid in the protein, beginning at the N-terminal with the first amino acid at position 1, followed by the total number of amino acids in the protein; the total includes a count for the stop codon) --- Alleles Submitted (column sampleAlleles: same information as sampleGenotype, with the ambiguity code resolved into two alleles; if there are multiple individuals, this is a list of all alleles present) --- Genotype in dbSNP (column dbSNPGenotype: the genotype in dbSNP for a particular individual; N/N if not available, X/X if conflicting measurements; this column is available only if a numerical dbSNP individual ID is entered on the query form) --- Alleles in dbSNP (column allelesDBSNP: list of alleles for all populations and individuals in the GVS database; if the variation is in dbSNP, but has no genotypes for individuals, will be "NA") --- Conservation Score phastCons (column scorePhastCons: UCSC, 46 placental mammalian species, range of 0 to 1, with 1 being the most conserved) --- Conservation Score GERP (column consScoreGERP: rejected-substitution score from the program GERP, Stanford University, 34 mammalian species, range of -11.6 to 5.82, with 5.82 being the most conserved) --- Chimp Allele (column chimpAllele: UCSC alignments) --- Copy Number Variations (column CNV: Toronto database) --- Genes (column geneList: HUGO names, any for which the transcription region overlaps the variation) --- HapMap Frequencies (3 columns AfricanHapMapFreq, EuropeanHapMapFreq, AsianHapMapFreq: African, European, and Asian, in percent) --- Has Genotypes (column hasGenotypes: whether GVS, and thus usually dbSNP, has genotypes available for the variation) --- dbSNP Validation (column dbSNPValidation: dbSNP validation status codes, dealing with e.g. whether the variation has been seen at least twice) --- Repeats (2 columns repeatMasker and tandemRepeat: NA if the variation is not in the GVS database) --- Grantham Score (column granthamScore: the Grantham score of any amino acid changes, as per Grantham (1974) Science, Table 2) --- microRNAs (column microRNAs: EMBL IDs of any microRNAs in which the variation is found) --- Protein Sequence (column proteinSequence: sequence of amino acids for the overlapping transcript if applicable) --- PolyPhen Prediction (column polyPhen: amino acid substitution impacts, more information below) --- Clinical Association (column clinicalAssociation: links to NCBI pages, more information below) --- Distance to Nearest Splice Site (column distanceToSplice: how close the variation is to a splice site, more information below) --- cDNA Position (column cDNAPosition: the location within the sequence of coding bases; NA if not coding; so far only SNPs) assert headerindel1==headerindel2, "The two files do not have the same headers,please use the same settings for seattleseq output" """ IUB_codes={'A':'A','G':'G','C':'C','T':'T', 'R':'AG','Y':'CT','K':'GT','M':'AC', 'S':'GC','W':'AT','N':'N'} fw=open(outputfile, 'w') import gzip myfile = open(inputfile) #print 'Input file is %s lines'%len(inlines) sys.stderr.write('Seattle_seq annotation started for %s\n'%outputfile) htl = myfile.readline().strip().split('\t') ss_dict = {} ss_file_flag=0 for file_ in glob.glob(os.path.join(mycfg.seattleseq_dir_path,'*')): if 'SeattleSeqAnnotation' in file_ and 'ss_snv' in file_: ss_file_flag+=1 ss_lines=gzip.open(file_,'rb').readlines() shtl=ss_lines[0].strip().split('\t') shtl.reverse()#so the columns appear in correct order in the final file for item in shtl: if not 'ss_%s'%item in htl: htl.insert(0,'ss_{0}'.format(item)) shtl.reverse()#take the list back to it's normal order fw.write('\t'.join(htl)+'\n') for ss_line in ss_lines[1:]: stl = ss_line.strip().split('\t') if (not ss_line.startswith('#')): mykey = (stl[shtl.index('chromosome')],stl[shtl.index('position')]) ss_dict[mykey] = {} for item in shtl: ss_dict[mykey][item] = stl[shtl.index(item)] for line in myfile: tl = line.strip().split('\t') for i in range(len(htl)-len(tl)): tl.insert(0,'-') if tl[htl.index('hg19LeftFlank')] != '-': mylookupkey = (tl[htl.index('Chr')].replace('chr',''),str(int(tl[htl.index('hg19LeftFlank')])+1)) #print mylookupkey #raise Exception if ss_dict.get(mylookupkey): for item in shtl: tl[htl.index('ss_{0}'.format(item))] = ss_dict.get(mylookupkey)[item] if 'refseq' in htl: if tl[htl.index('refseq')]=='NA' and ss_dict.get(mylookupkey)['geneList']!='none': tl[htl.index('refseq')]= ss_dict.get(mylookupkey)['geneList'] fw.write('\t'.join(tl)+'\n') fw.close() if not ss_file_flag: shutil.copyfile(inputfile, outputfile) sys.stderr.write("""SeattleSeq annotation file was not found in the input directory continuing without seattle seq annotations """) elif ss_file_flag > 1: sys.stderr.write("""More than one SeattleSeq annotation file was found in the input directory please check the files and make sure the annotation files are correct""") def annotate_seattle_seq_indel(inputfile , outputfile): IUB_codes={'A':'A','G':'G','C':'C','T':'T', 'R':'AG','Y':'CT','K':'GT','M':'AC', 'S':'GC','W':'AT','N':'N'} fw=open(outputfile,'w') import gzip myfile = open(inputfile) #print 'Input file is %s lines'%len(inlines) htl = myfile.readline().strip().split('\t') ss_file_flag=0 ss_dict = {} for file_ in glob.glob(os.path.join(mycfg.seattleseq_dir_path,'*')): if 'SeattleSeqAnnotation' in file_ and 'ss_indel' in file_: ss_file_flag=1 ss_lines=gzip.open(file_,'rb').readlines() #shtl=ss_lines[0].lstrip('# ').strip().split('\t') shtl=ss_lines[0].strip().split('\t') shtl.reverse()#so the columns appear in correct order in the final file for item in shtl: if not 'ss_%s'%item in htl: htl.insert(0,'ss_{0}'.format(item)) shtl.reverse()#take the list back to it's normal order fw.write('\t'.join(htl)+'\n') for ss_line in ss_lines[1:]: stl=ss_line.strip().split('\t') if not ss_line.startswith('#'): mykey = (stl[shtl.index('chromosome')],stl[shtl.index('position')]) ss_dict[mykey] = {} for item in shtl: ss_dict[mykey][item] = stl[shtl.index(item)] for line in myfile: tl = line.strip().split('\t') for i in range(len(htl)-len(tl)): tl.insert(0,'-') if tl[htl.index('hg19LeftFlank')] != '-': mylookupkey = (tl[htl.index('Chr')].replace('chr',''),tl[htl.index('hg19LeftFlank')]) if ss_dict.get(mylookupkey): for item in shtl: tl[htl.index('ss_{0}'.format(item))] = ss_dict.get(mylookupkey)[item] if 'refseq' in htl: if tl[htl.index('refseq')]=='NA' and ss_dict.get(mylookupkey)['geneList']!='none': tl[htl.index('refseq')]= ss_dict.get(mylookupkey)['geneList'] fw.write('\t'.join(tl)+'\n') fw.close() if not ss_file_flag: shutil.copyfile(inputfile ,outputfile) sys.stderr.write("""SeattleSeq annotation file was not found in the input directory continuing without seattle seq annotations """) def annotate_with_dbsnp(file_, output_file): """ works on myutils.coords19 BASED varsifter files that has hg19 annotations adds dbsnp137 data to the file """ sys.stderr.write('NOTICE: varmd(annotate_with_dbsnp) started\n') test=0 verbose=0 if verbose: print 'file',file_ try: expression_snps = [x.strip() for x in open(mycfg.expression_snps_path).readlines()] except Exception: raise Exception headers_to_add =[ 'Homozygous_VarAllele_UDP_count', 'Single_VarAllele_UDP_count', 'all_mpg:coverage_score', 'dbsnp137G5', 'dbsnp137ClinicalSignificance', 'dbsnp137GMAF', 'dbsnp137RS#', 'dbsnp137', 'predicted_expression_snp', 'varscore', ] fwlog=open('%s_log'%output_file,'w') myfile1 = open(file_) htl = myfile1.readline().strip().split('\t') file_already_has_dbsnp=True for htl_ in headers_to_add: if htl_ not in htl: file_already_has_dbsnp = False if file_already_has_dbsnp: os.system('cp %s %s'%(file_, output_file)) fwlog.write('\n\nFile already has dbsnp data, please continue analysis\n') elif not file_already_has_dbsnp: myfile1 = open(file_) if not 'hg19LeftFlank' in htl: print 'hg19LeftFlank is not in htl' print 'htl :', htl raise Exception fw = open(output_file,'w') fwlog = open('%s_log'%output_file,'w') writtenlinescounter=0 counter=0 #time.sleep(10) htl = myfile1.readline().strip().split('\t') for htl_ in headers_to_add: htl.insert(0,htl_) hline='\t'.join(htl)+'\n' fw.write(hline) linescounter=0 vcfadded=0 mykg_table = Table('kgdict', metadata, autoload=True) chrindex = htl.index('Chr') hg19LeftFlankIndex = htl.index('hg19LeftFlank') for chr in chrs: print chr myselect = mykg_table.select().where(mykg_table.c.Chr == chr.replace('chr','')) #rs = myselect.execute().fetchall() rs = myselect.execute() mydict = {} for x in rs: mydict[(x[1],str(x[2]-1))]= x[3:] myfile1 = open(file_) myfile1.readline() chrlinecounter=0 for line in myfile1: #if chrlinecounter > 20000: # break line = unicode(line, 'utf-8') tl=line.strip().split('\t') tl.insert(htl.index('varscore'),'0') tl.insert(1,'-') tl.insert(2,'-') tl.insert(3,'-') tl.insert(4,'-') tl.insert(5,'-') tl.insert(6,'-') tl.insert(7,'-') tl.insert(8,'-') tl.insert(9,'-') if test: print tl time.sleep(1) vcffound=0 counter+=1 #verbose=True if verbose and (counter%10)==0: print 'progress is %f' %counter #if tl[hg19LeftFlankIndex] in ['13301', '13326', '13979']: # print "\ntest case" # print 'tl',tl[:20] # print 'mydict',mydict.get((tl[chrindex].replace('chr',''),tl[hg19LeftFlankIndex])) if tl[chrindex]== chr: if mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])): vcffound+=1 gmafscore='' scsclass='' mysnpdata = mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])) kgvcftemp = mysnpdata[5].split(';') snp_name = mysnpdata[0] #print 'original dict entry', mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])) #print 'kgvcftem', kgvcftemp #print 'snp_name', snp_name #time.sleep(1) for item in kgvcftemp: if item.startswith('VP'): kgvcftemp.remove(item) if item=='G5': tl[htl.index('dbsnp137G5')]='+' #">5% minor allele frequency in 1+ populations" kgvcftemp.remove(item) elif item.startswith('GMAF'): gmafscore = item.split('=')[1] kgvcftemp.remove(item) elif item.startswith('SCS'): scs = item.split('=')[1] if scs=='0': scsclass='unknown' elif scs=='1': scsclass='untested' elif scs=='2': scsclass='non-pathogenic' elif scs=='3': scsclass='probable-non-pathogenic' elif scs=='4': scsclass='probable-pathogenic' elif scs=='5': scsclass='pathogenic' elif scs=='6': scsclass='drug-response' elif scs=='7': scsclass='histocompatibility' elif scs=='255': scsclass='other' kgvcftemp.remove(item) #print 'final kgvcftemp',kgvcftemp #time.sleep(10) tl[htl.index('dbsnp137')]=','.join(kgvcftemp) tl[htl.index('dbsnp137RS#')] = snp_name if snp_name in expression_snps: #adds snps that are found in expression modifier regions tl[htl.index('predicted_expression_snp')] = '+' tl[htl.index('dbsnp137GMAF')] = gmafscore #"Global Minor Allele \ #Frequency [0, 0.5]; global population is 1000Genomes phase 1 genotype data from 629 individuals, released in the 08-04-2010 dataset" tl[htl.index('dbsnp137ClinicalSignificance')] = scsclass vcfadded+=1 try: newline = '\t'.join(tl)+'\n' newline = newline.encode('ascii', 'ignore') fw.write(newline) except UnicodeDecodeError as e: print "there was an error with decoding the line content" print tl print e raise Exception if vcfadded: fwlog.write('\n\nOut of a total of %s lines, %s lines were annotated with dbsnp137 annotation \n'%(linescounter,vcfadded)) fw.close() endtime=time.time() duration=((endtime-startall)/60) fwlog.write('print "it took me %.2f minutes to complete the operation' %duration) fwlog.close() #send mail with primitive mail function that works in biowulf where smtp doesn't work. #if int(duration)>180: # sendmail() def annotate_with_ESP(file_, output_file): """ works on myutils.coords19 BASED varsifter files that has hg19 annotations adds ESP_variants from mysql database to the VAR-MD file """ sys.stderr.write('NOTICE: varmd(annotate_with_ESP_variants) started\n') test=0 verbose=0 if verbose: print 'file',file_ headers_to_add =[ 'ESP_info', 'ESP_TAC', 'ESP_AA_AC', 'ESP_EA_AC', 'ESP_alt', 'ESP_ref', ] fwlog=open('%s_log'%output_file,'w') myfile1 = open(file_) htl = myfile1.readline().strip().split('\t') file_already_has_annotations = True for htl_ in headers_to_add: if htl_ not in htl: file_already_has_annotations = False if file_already_has_annotations: os.system('cp %s %s'%(file_, output_file)) fwlog.write('\n\nFile already has ESP annotations , please continue analysis\n') elif not file_already_has_annotations: myfile1 = open(file_) if not 'hg19LeftFlank' in htl: print 'hg19LeftFlank is not in htl' print 'htl :', htl raise Exception fw = open(output_file,'w') fwlog = open('%s_log'%output_file,'w') writtenlinescounter=0 counter=0 #time.sleep(10) htl = myfile1.readline().strip().split('\t') for htl_ in headers_to_add: htl.insert(0,htl_) hline='\t'.join(htl)+'\n' fw.write(hline) linescounter=0 vcfadded=0 my_table = Table('ESP_variants', metadata, autoload=True) chrindex = htl.index('Chr') hg19LeftFlankIndex = htl.index('hg19LeftFlank') for chr in chrs: print chr myselect = my_table.select().where(my_table.c.Chr == chr.replace('chr','')) #rs = myselect.execute().fetchall() rs = myselect.execute() mydict = {} for x in rs: mydict[(x[1],str(x[2]-1))]= x[3:] myfile1 = open(file_) myfile1.readline() chrlinecounter=0 for line in myfile1: #if chrlinecounter > 20000: # break line = unicode(line, 'utf-8') tl=line.strip().split('\t') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') if test: print tl time.sleep(1) vcffound=0 counter+=1 #verbose=True if verbose and (counter%10)==0: print 'progress is %f' %counter #if tl[hg19LeftFlankIndex] in ['13301', '13326', '13979']: # print "\ntest case" # print 'tl',tl[:20] # print 'mydict',mydict.get((tl[chrindex].replace('chr',''),tl[hg19LeftFlankIndex])) if tl[chrindex]== chr: if mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])): vcffound+=1 gmafscore='' scsclass='' myresult = mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])) kgvcftemp = myresult[9].split(';') #print 'original dict entry', mydict.get((tl[chrindex].replace('chr',''), tl[hg19LeftFlankIndex])) #print 'kgvcftem', kgvcftemp #print 'snp_name', snp_name #time.sleep(1) #print 'final kgvcftemp',kgvcftemp #time.sleep(10) tl[htl.index('ESP_ref')]=myresult[1] tl[htl.index('ESP_alt')]=myresult[2] tl[htl.index('ESP_EA_AC')]=myresult[6] tl[htl.index('ESP_AA_AC')]=myresult[7] tl[htl.index('ESP_TAC')]=myresult[8] tl[htl.index('ESP_info')]= ';'.join(kgvcftemp) vcfadded+=1 try: newline = '\t'.join(tl)+'\n' newline = newline.encode('ascii', 'ignore') fw.write(newline) except UnicodeDecodeError as e: print "there was an error with decoding the line content" print tl print e raise Exception if vcfadded: fwlog.write('\n\nOut of a total of %s lines, %s lines were annotated with dbsnp137 annotation \n'%(linescounter,vcfadded)) fw.close() endtime=time.time() duration=((endtime-startall)/60) fwlog.write('print "it took me %.2f minutes to complete the operation' %duration) fwlog.close() #send mail with primitive mail function that works in biowulf where smtp doesn't work. #if int(duration)>180: # sendmail() def annotate_with_allfams(myinfile,myoutfile): """INPUT: 1000 genomes/dbsnp annotated files adds genotype frequency data from UDP cohort allsamples file """ sys.stderr.write('varmd (annotate_with_allfams) started for %s \n'%myinfile) myfile1 = open(myinfile) fw=open(myoutfile,'w') fwlog=open('{0}_log'.format(myoutfile),'a') try: conn = sqlite3.connect(mycfg.population_db_path) c=conn.cursor() except Exception, e: print """WARNING: There was a problem with opening the population database Please refer to annotate_with_allfams """ print 'the exception is',e hline=myfile1.readline() htl= hline.rstrip().split('\t') fw.write(hline) fwlog.write('Positions where no corresponding data from population file could be found\n') fwlog.write('Index\t\tChr\t\thg18LeftFlank\t\thg19LEftFlank\n') linecounter=0 for line in myfile1: linecounter+=1 found_unique_flag=0 found_multiple_flag=0 tl=line.strip().split('\t') mykey=(tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19RightFlank')], #tl[htl.index('muttype')], tl[htl.index('var_allele')]) rc = c.execute(''' select * from popdb where chr=? and hg19LeftFlank=? and hg19RightFlank=? and var_allele=?''',mykey) rs = rc.fetchall() if not rs: try: leftflankpos = tl[htl.index('LeftFlank')] except: leftflankpos = "None" fwlog.write('%s\n'%'\t'.join([tl[htl.index('Index')], tl[htl.index('Chr')], leftflankpos, tl[htl.index('hg19LeftFlank')]]) ) fw.write('\t'.join(tl)+'\n') elif len(rs)==1: #for x in rs[0]: # print x, rs[0].index(x) #raise Exception #print rs[0][5].split(';;') tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')} tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4)) tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4)) tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4)) found_unique_flag=1 fw.write('\t'.join(tl)+'\n') else: print '%s records found'%len(rs) print rs[0][:5] print rs[1][:5] tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')} tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4)) tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4)) tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4)) found_multiple_flag=1 fw.write('\t'.join(tl)+'\n') if linecounter%20000==0: print linecounter endtime=time.time() duration=((endtime-startall)/60) fwlog.write('print "it took me %.2f minutes to complete the operation' %duration) fw.close() fwlog.close() def annotate_with_allfams_mysql(myinfile,myoutfile): """INPUT: dbsnp annotated file adds genotype frequency data from UDP cohort allsamples database in mysql """ sys.stderr.write('varmd (annotate_with_allfams) started for %s \n'%myinfile) myfile1 = open(myinfile) fw=open(myoutfile,'w') fwlog=open('{0}_log'.format(myoutfile),'a') try: conn = sqlite3.connect(mycfg.population_db_path) c=conn.cursor() except Exception, e: print """WARNING: There was a problem with opening the population database Please refer to annotate_with_allfams """ print 'the exception is',e hline=myfile1.readline() htl= hline.rstrip().split('\t') fw.write(hline) fwlog.write('Positions where no corresponding data from population file could be found\n') fwlog.write('Index\t\tChr\t\thg18LeftFlank\t\thg19LEftFlank\n') linecounter=0 for line in myfile1: linecounter+=1 found_unique_flag=0 found_multiple_flag=0 tl=line.strip().split('\t') mykey=(tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19RightFlank')], #tl[htl.index('muttype')], tl[htl.index('var_allele')]) rc = c.execute(''' select * from popdb where chr=? and hg19LeftFlank=? and hg19RightFlank=? and var_allele=?''',mykey) rs = rc.fetchall() if not rs: try: leftflankpos = tl[htl.index('LeftFlank')] except: leftflankpos = "None" fwlog.write('%s\n'%'\t'.join([tl[htl.index('Index')], tl[htl.index('Chr')], leftflankpos, tl[htl.index('hg19LeftFlank')]]) ) fw.write('\t'.join(tl)+'\n') elif len(rs)==1: #for x in rs[0]: # print x, rs[0].index(x) #raise Exception #print rs[0][5].split(';;') tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')} tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4)) tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4)) tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4)) found_unique_flag=1 fw.write('\t'.join(tl)+'\n') else: print '%s records found'%len(rs) print rs[0][:5] print rs[1][:5] tafl = {x.split('::')[0]:x.split('::')[1] for x in rs[0][5].split(';;')} tl[htl.index('Homozygous_VarAllele_UDP_count')]='{0}'.format(tafl['var_gt_count'].rjust(4)) tl[htl.index('Single_VarAllele_UDP_count')]='{0}'.format(tafl['var_allele_count'].rjust(4)) tl[htl.index('all_mpg:coverage_score')]='{0}'.format(tafl['all_mpg:coverage_score'].rjust(4)) found_multiple_flag=1 fw.write('\t'.join(tl)+'\n') if linecounter%20000==0: print linecounter endtime=time.time() duration=((endtime-startall)/60) fwlog.write('print "it took me %.2f minutes to complete the operation' %duration) fw.close() fwlog.close() def create_allsamples_db(): """ Generates a sqlite3 database using the data in the Allsamples file the path to input data comes from config.population_file The path to database file comes from configuration.population_db_path """ myfile = varfile(open(mycfg.population_file).readline().strip().split('\t') ) patient_pos=myfile.first_sample_index fr = open(mycfg.population_file) headerline = fr.readline() htl = headerline.strip().split('\t') htl.insert(0,'var_allele_count') htl.insert(0,'ref_allele_count') htl.insert(0,'var_gt_count') htl.insert(0,'ref_gt_count') htl.insert(0,'all_mpg:coverage_score') qsubscratchpath='/scratch/varmd/' scratchflag = 0 if os.path.exists(qsubscratchpath): mydatabasepath = os.path.join(qsubscratchpath ,'population_sqlite3.db') scratchflag = 1 else: mydatabasepath = mycfg.population_db_path conn = sqlite3.connect(mydatabasepath) print 'database is opened at %s'%mydatabasepath c=conn.cursor() c.execute(''' create table IF NOT EXISTS popdb (chr INTEGER, hg19LeftFlank INTEGER, hg19RightFlank INTEGER, muttype, var_allele, data, var_allele_count, ref_allele_count, var_gt_count, ref_gt_count, all_mpg_coverage_score )''') linecounter=0 duplicates=0 fr = open(mycfg.population_file) fr.readline() for line in fr: line = unicode(line, 'utf-8') linecounter+=1 #print line;time.sleep(0.5) tl=line.strip().split('\t') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') newline = varline(tl,htl) if not (tl[ htl.index('hg19LeftFlank')].isdigit() and tl[ htl.index('hg19RightFlank')].isdigit()): pass else: tl[htl.index('var_allele_count')]=str(newline.var_allele_count) tl[htl.index('ref_allele_count')]=str(newline.ref_allele_count) tl[htl.index('var_gt_count')]=str(newline.var_gt_count) tl[htl.index('ref_gt_count')]=str(newline.ref_gt_count) tl[htl.index('all_mpg:coverage_score')]=str(newline.mpg_coverage_score) #DO NOT USE REFSEQ as part of the key as it creates errors #when the same position is annotated with different gene names # there are a few indel based multiple variants with this key combination #this is currently not handled by var-md #work on this later on myentry = ['%s::%s'%(htl[i],tl[i]) for i in range(len(tl))] #for each item in the line add column header as key and item as value c.execute('''insert into popdb values(?,?,?,?,?,?,?,?,?,?,?)''',( tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19RightFlank')], tl[htl.index('muttype')], tl[htl.index('var_allele')], ';;'.join(myentry), str(newline.var_allele_count), str(newline.ref_allele_count), str(newline.var_gt_count), str(newline.ref_gt_count), str(newline.mpg_coverage_score), )) if linecounter%10000 == 0: conn.commit() #it is very important to commit regularly, if you don't commit #every so often the roll back journal gets too big and halts the script and brings it to its knees if linecounter%30000 == 0: print linecounter #result=c.execute('select * from kgdict limit 10''') #print result.fetchone() #time.sleep(2) print 'database created, now creating index' c.execute('CREATE INDEX idx_popdb ON popdb(chr,hg19LeftFlank,hg19RightFlank)') conn.commit() c.close() print ('database created') print 'finished processing %s'%mycfg.population_file print 'Total %s lines had duplicates'%duplicates print 'new files can be found under %s directory'%os.path.dirname(mycfg.population_file) if scratchflag ==1: os.system('cp %s %s'%(mydatabasepath, mycfg.population_db_path)) os.system('rm %s'%(mydatabasepath)) sendmail() def create_allsamples_db_mysql(): """ Generates a database using the data in the Allsamples file the path to input data comes from config.population_file The path to database file comes from configuration.population_db_path """ #print 'please edit the table name for a new input file' #raise Exception popdb_table = Table( 'popdb_table_06_02',metadata, Column('myindex',Integer,primary_key=True), Column('NISC_Index',Integer,nullable=False), Column('Chr',String(10),nullable=False), Column('LeftFlank',Integer,nullable=False), Column('RightFlank',Integer,nullable=False), Column('hg19LeftFlank',Integer,nullable=False), Column('hg19RightFlank',Integer,nullable=False), Column('ref_allele',String(300),nullable=False), Column('var_allele',String(300),nullable=False), Column('muttype',String(30),nullable=False), Column('variant_type',String(40),nullable=False), Column('var_allele_count',Integer,nullable=False), Column('ref_allele_count',Integer,nullable=False), Column('var_gt_count',Integer,nullable=False), Column('ref_gt_count',Integer,nullable=False), Column('all_mpg_coverage_score',Integer,nullable=False), Column('Gene_name',String(500),nullable=False), Column('loc_type',String(100),nullable=False), Column('consequence',String(1000),nullable=False), Column('All_Gahlc_genotypes_NISC',Integer,nullable=False), Column('All_Gahlc_homref_NISC',Integer,nullable=False), Column('All_Gahlc_het_NISC',Integer,nullable=False), Column('All_Gahlc_homvar_NISC',Integer,nullable=False), Column('All_Gahlc_hetnonref_NISC',Integer,nullable=False), Column('All_Gahlc_other_NISC',Integer,nullable=False), Column('All_Gahlc_refallele_NISC',Integer,nullable=False), Column('All_Gahlc_varallele_NISC',Integer,nullable=False), Column('All_Gahlfreq_homref_NISC',Float(precision=4),nullable=False), Column('All_Gahlfreq_het_NISC',Float(precision=4),nullable=False), Column('All_Gahlfreq_homvar_NISC',Float(precision=4),nullable=False), Column('All_Gahlfreq_hetnonref_NISC',Float(precision=4),nullable=False), Column('All_Gahlfreq_refallele_NISC',Float(precision=4),nullable=False), Column('All_Gahlfreq_varallele_NISC',Float(precision=4),nullable=False), Column('All_Gahlref_is_minor_NISC',Integer ,nullable=False), Column('All_Gahlc_minor_NISC', Integer ,nullable=False), Column('All_Gahlmaf_NISC',Float(precision=4),nullable=False), Column('All_Gahlchisquare_NISC',Float(precision=4),nullable=False), Column('CS_c_genotypes_NISC', Integer ,nullable=False), Column('CS_c_homref_NISC', Integer ,nullable=False), Column('CS_c_het_NISC', Integer ,nullable=False), Column('CS_c_homvar_NISC', Integer ,nullable=False), Column('CS_c_hetnonref_NISC', Integer ,nullable=False), Column('CS_c_other_NISC', Integer ,nullable=False), Column('CS_c_refallele_NISC', Integer ,nullable=False), Column('CS_c_varallele_NISC', Integer ,nullable=False), Column('CS_freq_homref_NISC', Float(precision=4) ,nullable=False), Column('CS_freq_het_NISC', Float(precision=4) ,nullable=False), Column('CS_freq_homvar_NISC', Float(precision=4) ,nullable=False), Column('CS_freq_hetnonref_NISC', Float(precision=4) ,nullable=False), Column('CS_freq_refallele_NISC', Float(precision=4) ,nullable=False), Column('CS_freq_varallele_NISC', Float(precision=4) ,nullable=False), Column('CS_ref_is_minor_NISC', Integer ,nullable=False), Column('CS_c_major_NISC', Integer ,nullable=False), Column('CS_c_minor_NISC', Integer ,nullable=False), Column('CS_maf_NISC', Float(precision=4) ,nullable=False), Column('CS_chisquare_NISC', Float(precision=4) ,nullable=False), Column('transcript', String(100) ,nullable=False), Column('ref_aa', String(100) ,nullable=False), Column('var_aa', String(100) ,nullable=False), Column('aa_pos', String(100) ,nullable=False), Column('CDPred_score', String(100) ,nullable=False), Column('RS_number', String(100) ,nullable=False), Column('HGMDids', String(100) ,nullable=False), Column('HGMDdisease', String(300) ,nullable=False), Column('HGMDtags', String(100) ,nullable=False), Column('HGMD_HGVS', String(100) ,nullable=False), Column('REF_MATCHES_HGMD_DISEASE_ALLELE', String(100) ,nullable=False), Column('HGMD_ALLELES_in_GENOME_STRAND', String(100) ,nullable=False), Column('HGMDinGene', String(100) ,nullable=False), Column('K1_total_depth_at_site', String(100) ,nullable=False), Column('K1_rms_mapping_quality', String(100) ,nullable=False), Column('K1_num_samples_with_covg', String(100) ,nullable=False), Column('K1_num_alleles_in_samples_with_covg', String(100) ,nullable=False), Column('K1_alt_allele_cnts_in_samples_with_covg', String(100) ,nullable=False), Column('K1_alt_allele_freqs', String(100) ,nullable=False), Column('K1_allele_balance_in_hets', String(100) ,nullable=False), #Column('data',String(50000),nullable=False), ) all_genotypes_table = Table( 'all_genotypes_table_06_02',metadata, Column('myindex',Integer,primary_key=True), Column('varid',Integer, nullable=False), Column('sample_code',String(30),nullable=False), Column('GT',String(800),nullable=False), Column('mpg_score',String(10), nullable=True), Column('coverage', String(10) ,nullable = True ), ) Index('idx_popdb', popdb_table.c.Chr, popdb_table.c.hg19LeftFlank, popdb_table.c.hg19RightFlank) Index('idx_gt', all_genotypes_table.c.varid) metadata.create_all() my_s = popdb_table.select().order_by(popdb_table.c.myindex.desc()).limit(1) a = my_s.execute() last_index=0 for x in a: last_index = x[0] #print 'last_index',last_index break fr = open(mycfg.population_file) headerline = fr.readline() htl = headerline.strip().split('\t') htl.insert(0,'var_allele_count') htl.insert(0,'ref_allele_count') htl.insert(0,'var_gt_count') htl.insert(0,'ref_gt_count') htl.insert(0,'all_mpg:coverage_score') myfile = varfile(htl) patient_pos = myfile.first_sample_index linecounter=0 duplicates=0 fr = open(mycfg.population_file) fr.readline() for line in fr: line = unicode(line, 'utf-8') linecounter+=1 if linecounter <= last_index: continue #print line;time.sleep(0.5) tl=line.strip().split('\t') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') tl.insert(0,'-') newline = varline(tl,htl) if not (tl[ htl.index('hg19LeftFlank')].isdigit() and tl[ htl.index('hg19RightFlank')].isdigit()): pass else: tl[htl.index('var_allele_count')] = str(newline.var_allele_count) tl[htl.index('ref_allele_count')] = str(newline.ref_allele_count) tl[htl.index('var_gt_count')] = str(newline.var_gt_count) tl[htl.index('ref_gt_count')] = str(newline.ref_gt_count) tl[htl.index('all_mpg:coverage_score')] = str(newline.mpg_coverage_score) #DO NOT USE REFSEQ as part of the key as it creates errors #when the same position is annotated with different gene names # there are a few indel based multiple variants with this key combination #this is currently not handled by var-md #work on this later on last_index +=1 CS_c_genotypes = tl[htl.index('CS_801c_genotypes')] CS_c_homref = tl[htl.index('CS_801c_homref')] CS_c_het = tl[htl.index('CS_801c_het')] CS_c_homvar = tl[htl.index('CS_801c_homvar')] CS_c_hetnonref = tl[htl.index('CS_801c_hetnonref')] CS_c_other = tl[htl.index('CS_801c_other')] CS_c_refallele = tl[htl.index('CS_801c_refallele')] CS_c_varallele = tl[htl.index('CS_801c_varallele')] CS_freq_homref = tl[htl.index('CS_801freq_homref')] CS_freq_het = tl[htl.index('CS_801freq_het')] CS_freq_homvar = tl[htl.index('CS_801freq_homvar')] CS_freq_hetnonref = tl[htl.index('CS_801freq_hetnonref')] CS_freq_refallele = tl[htl.index('CS_801freq_refallele')] CS_freq_varallele = tl[htl.index('CS_801freq_varallele')] CS_ref_is_minor = tl[htl.index('CS_801ref_is_minor')] CS_c_major = tl[htl.index('CS_801c_major')] CS_c_minor = tl[htl.index('CS_801c_minor')] CS_maf = tl[htl.index('CS_801maf')] CS_chisquare = tl[htl.index('CS_801chisquare')] #myentry = ['%s::%s'%(htl[i],tl[i]) for i in range(76,len(tl))] #for each item in the line add column header as key and item as value ins = popdb_table.insert() ins.execute( myindex = last_index , NISC_Index = tl[htl.index('Index')], Chr = tl[htl.index('Chr')], LeftFlank = tl[htl.index('LeftFlank')], RightFlank = tl[htl.index('RightFlank')], hg19LeftFlank = tl[htl.index('hg19LeftFlank')], hg19RightFlank = tl[htl.index('hg19RightFlank')], ref_allele = tl[htl.index('ref_allele')], var_allele = tl[htl.index('var_allele')], muttype = tl[htl.index('muttype')], variant_type = tl[htl.index('type')], var_allele_count = newline.var_allele_count, ref_allele_count = newline.ref_allele_count, var_gt_count = newline.var_gt_count, ref_gt_count = newline.ref_gt_count, all_mpg_coverage_score = newline.mpg_coverage_score, Gene_name = tl[htl.index('Gene_name')], loc_type = tl[htl.index('loc_type')], consequence = tl[htl.index('consequence')], All_Gahlc_genotypes_NISC = tl[htl.index('All_Gahlc_genotypes')], All_Gahlc_homref_NISC = tl[htl.index('All_Gahlc_homref')], All_Gahlc_het_NISC = tl[htl.index('All_Gahlc_het')], All_Gahlc_homvar_NISC = tl[htl.index('All_Gahlc_homvar')], All_Gahlc_hetnonref_NISC = tl[htl.index('All_Gahlc_hetnonref')], All_Gahlc_other_NISC = tl[htl.index('All_Gahlc_other')], All_Gahlc_refallele_NISC = tl[htl.index('All_Gahlc_refallele')], All_Gahlc_varallele_NISC = tl[htl.index('All_Gahlc_varallele')], All_Gahlfreq_homref_NISC = tl[htl.index('All_Gahlfreq_homref')], All_Gahlfreq_het_NISC = tl[htl.index('All_Gahlfreq_het')], All_Gahlfreq_homvar_NISC = tl[htl.index('All_Gahlfreq_homvar')], All_Gahlfreq_hetnonref_NISC = tl[htl.index('All_Gahlfreq_hetnonref')], All_Gahlfreq_refallele_NISC = tl[htl.index('All_Gahlfreq_refallele')], All_Gahlfreq_varallele_NISC = tl[htl.index('All_Gahlfreq_varallele')], All_Gahlref_is_minor_NISC = tl[htl.index('All_Gahlref_is_minor')], All_Gahlc_minor_NISC = tl[htl.index('All_Gahlref_is_minor')], All_Gahlmaf_NISC = tl[htl.index('All_Gahlmaf')], All_Gahlchisquare_NISC = tl[htl.index('All_Gahlchisquare')], CS_c_genotypes_NISC = CS_c_genotypes, CS_c_homref_NISC = CS_c_homref, CS_c_het_NISC = CS_c_het , CS_c_homvar_NISC = CS_c_homvar , CS_c_hetnonref_NISC = CS_c_hetnonref , CS_c_other_NISC = CS_c_other , CS_c_refallele_NISC = CS_c_refallele , CS_c_varallele_NISC = CS_c_varallele , CS_freq_homref_NISC = CS_freq_homref , CS_freq_het_NISC = CS_freq_het , CS_freq_homvar_NISC = CS_freq_homvar , CS_freq_hetnonref_NISC = CS_freq_hetnonref , CS_freq_refallele_NISC = CS_freq_refallele , CS_freq_varallele_NISC = CS_freq_varallele , CS_ref_is_minor_NISC = CS_ref_is_minor , CS_c_major_NISC = CS_c_major , CS_c_minor_NISC = CS_c_minor , CS_maf_NISC = CS_maf , CS_chisquare_NISC = CS_chisquare , transcript = tl[htl.index('transcript')], ref_aa = tl[htl.index('ref_aa')], var_aa = tl[htl.index('var_aa')], aa_pos = tl[htl.index('aa_pos')], CDPred_score = tl[htl.index('CDPred_score')], RS_number = tl[htl.index('RS#')], HGMDids = tl[htl.index('HGMDids')], HGMDdisease = tl[htl.index('HGMDdisease')], HGMDtags = tl[htl.index('HGMDtags')], HGMD_HGVS = tl[htl.index('HGMD_HGVS')], REF_MATCHES_HGMD_DISEASE_ALLELE = tl[htl.index('REF_MATCHES_HGMD_DISEASE_ALLELE')], HGMD_ALLELES_in_GENOME_STRAND = tl[htl.index('HGMD_ALLELES_in_GENOME_STRAND')], HGMDinGene = tl[htl.index('HGMDinGene')], K1_total_depth_at_site = tl[htl.index('1K_total_depth_at_site')], K1_rms_mapping_quality = tl[htl.index('1K_rms_mapping_quality')], K1_num_samples_with_covg = tl[htl.index('1K_#samples_with_covg')], K1_num_alleles_in_samples_with_covg = tl[htl.index('1K_#alleles_in_samples_with_covg')], K1_alt_allele_cnts_in_samples_with_covg = tl[htl.index('1K_alt_allele_cnts_in_samples_with_covg')], K1_alt_allele_freqs = tl[htl.index('1K_alt_allele_freqs')], K1_allele_balance_in_hets = tl[htl.index('1K_allele_balance_in_hets')], #data = ';;'.join(myentry), ) #myentry = ['%s::%s'%(htl[i],tl[i]) for i in range(76,len(tl))] #print '='*40 #for i in range(76,len(tl)-2,3): for i in range(patient_pos, len(tl)-2,3): #print htl[i], tl[i:i+3] #time.sleep(0.1) ins = all_genotypes_table.insert() ins.execute( myindex = None, varid = last_index, sample_code = htl[i].replace('.NA','') , GT = tl[i] , mpg_score = tl[i+1] , coverage = tl[i+2] , ) if linecounter%30000 == 0: print linecounter #result=c.execute('select * from kgdict limit 10''') #print result.fetchone() #time.sleep(2) print 'finished processing %s'%mycfg.population_file print 'Total %s lines had duplicates'%duplicates print 'popdb database created' sendmail() def hg19_bin(chr,pos,coords19): i=0 for coord19 in coords19: if chr.lower().replace('chr','') == coord19[0].lower().replace('chr','') and int(coord19[1]) <= int(pos) and int(coord19[2]): return i break else: pass i+=1 def dbsnp_vcf2db(): """ Populates the mysql database with dbsnp data """ kgdict_table = Table( 'kgdict',metadata, Column('myindex',Integer,primary_key=True), Column('Chr',String(10),nullable=False), #Column('bin',Integer,nullable=False), Column('pos',Integer,nullable=False), Column('id',String(100),nullable=False), Column('ref',String(800),nullable=False), Column('alt',String(300),nullable=False), Column('qual',String(200),nullable=False), Column('filter',String(20),nullable=False), Column('info',String(200),nullable=False), ) Index('idx_kgdict', kgdict_table.c.Chr, kgdict_table.c.pos) metadata.create_all() #mydbpath = os.path.join('/scratch',os.path.basename(mycfg.dbsnp_db_path)) #conn = sqlite3.connect(mydbpath) #c=conn.cursor() #c.execute(''' #create table IF NOT EXISTS kgdict (chr INTEGER, # bin INTEGER, # pos INTEGER, # id TEXT, # ref TEXT, # alt TEXT, # qual TEXT, # filter TEXT, # info)''') linecounter=0 import gzip for line in gzip.open(mycfg.dbsnp_vcf_path): linecounter+=1 if not line.startswith('#'): #print line;time.sleep(0.5) tl=line.strip().split('\t') if tl[0]=='MT': #correct the mitochondrial chromosome designation for internal consistency tl[0]='M' mytl7 = tl[7].split(';') for x in mytl7: if 'RSPOS=' in x: mytl7.remove(x) if 'VP=' in x: mytl7.remove(x) ins = kgdict_table.insert() ins.execute( myindex=None, Chr= tl[0], #bin = '', pos = tl[1], id = tl[2], ref = tl[3], alt = tl[4], qual = tl[5], filter = tl[6], info = ';'.join(mytl7) ) #except Exception: # print "can't insert line in to db", tl if linecounter %500000 == 0: print linecounter #result=c.execute('select * from kgdict limit 10''') #print result.fetchone() #time.sleep(2) print 'database created at %s'%mycfg.dbsnp_db_path #c.execute('CREATE INDEX idx_kgdict ON kgdict (chr,bin,pos)') #conn.commit() #c.close() #os.system('cp %s %s'%(mydbpath, os.path.dirname(os.path.abspath(sys.argv[0])))) sendmail() def ESP_vcf2db(): """ Populates the mysql database with Exome Sequencing Project data """ ESP_variants_table = Table( 'ESP_variants',metadata, Column('myindex',Integer, primary_key = True), Column('Chr',String(10),nullable = False), #Column('bin',Integer,nullable=False), Column('pos',Integer,nullable=False, autoincrement = False), Column('id',String(100),nullable=False), Column('ref',String(800),nullable=False), Column('alt',String(300),nullable=False), Column('qual',String(200),nullable=False), Column('filter',String(20),nullable=False), Column('DBSNP',String(100),nullable=False), Column('EA_AC',String(100), nullable = False ), Column('AA_AC',String(100), nullable = False ), Column('TAC',String(100), nullable = False ), Column('info',String(700),nullable = False), ) Index('idx_ESP_variants', ESP_variants_table.c.Chr, ESP_variants_table.c.pos) metadata.create_all() linecounter=0 for line in open(args.input_file): linecounter+=1 if not line.startswith('#'): #print line;time.sleep(0.5) tl=line.strip().split('\t') if tl[0]=='MT': #correct the mitochondrial chromosome designation for internal consistency tl[0]='M' mytl7 = tl[7].split(';') #split info field dbsnp='' ea_ac='' aa_ac='' tac='' mytlinfo= deepcopy(mytl7) for x in mytl7: if 'DBSNP=' in x: dbsnp = x.split("=")[1] mytlinfo.remove(x) elif 'EA_AC=' in x: ea_ac = x.split("=")[1] mytlinfo.remove(x) elif 'AA_AC=' in x: aa_ac = x.split("=")[1] mytlinfo.remove(x) elif 'TAC=' in x: tac = x.split("=")[1] mytlinfo.remove(x) ins = ESP_variants_table.insert() ins.execute( myindex=None, Chr= tl[0], #bin = '', pos = tl[1], id = tl[2], ref = tl[3], alt = tl[4], qual = tl[5], filter = tl[6], DBSNP = dbsnp, EA_AC = ea_ac, AA_AC = aa_ac, TAC = tac, info = ';'.join(mytlinfo) ) if linecounter %500000 == 0: print linecounter print 'database created at mysql server ' sendmail() def prepare_liftover_input(vsfile, outfile): """ input: vsfile provided by the function call logic: create a bed file that contains chr start stop and index fields from\ the vs.filtered file output:created bed file to be used as input for the liftover program for adding\ hg19 coordinates to the files """ try: infile = open(vsfile) except Exception: print 'The input file %s was not found, please make sure you have supplied the correct path' raise Exception hline = infile.readline() htl = hline.strip().split('\t') fw = open(outfile,'w') #fw.write(inlines[0]) for line in infile: tl=line.strip().split('\t') ntl='%s '*4%(tl[htl.index('Chr')],tl[htl.index('LeftFlank')],tl[htl.index('RightFlank')],tl[htl.index('Index')]) #print 'ntl',ntl #time.sleep(100) fw.write(ntl+'\n') #time.sleep(2) def add_hg19(infile,hg19bed,hg19outfile): """ infile is the hg18 coordinate varsifter file, hg19coordinates in a liftover created bed file, [hg19outfile] is the outputfile name logic: adds hg19 coordinates to infile based on index numbers output: new file with hg19 coordinates added """ try: #create a dictionary where the key is the index number of the input file and the value #is the hg19 coordinates for faster search hg19file = open(hg19bed) hg19lifts={} for chr in chrs: hg19lifts[chr]={} """ for x in hg19file: tl=x.strip().split('\t') #if tl[0].lower()==chr.lower(): #hg19lifts[tl[3]]=tl[1:3] hg19lifts[tl[3]]=tl[1:3] """ for chr in chrs: hg19file.seek(0) for x in hg19file: tl=x.strip().split('\t') if tl[0].lower()==chr.lower(): hg19lifts[chr][tl[3]]=tl[1:3] except Exception: print 'hg19bed file was not read into memory. Please check add_hg19 function for details.' fw=open(hg19outfile,'w') infile = open(infile) hline = infile.readline() htl = hline.strip().split('\t') nhtl = htl[:htl.index('RightFlank')+1]+['hg19LeftFlank','hg19RightFlank']+htl[htl.index('RightFlank')+1:] fw.write('\t'.join(nhtl)+'\n') #outputcounter=0 for line in infile: tl=line.strip().split('\t') #print tl[0] ntl=[] #time.sleep(1) if hg19lifts.get(tl[htl.index('Chr')]): if tl[htl.index('Index')] in hg19lifts.get(tl[htl.index('Chr')]).keys(): #print 'tl',tl #print 'hg19lifts tl[]',hg19lifts[tl[0]] #outputcounter+=1 #print 'tl',tl #print 'hg19tl',hg19tl #ntl=tl[:4]+hg19tl[1:3]+tl[4:] ntl = tl[:htl.index('RightFlank')+1] + hg19lifts[tl[htl.index('Chr')]][tl[htl.index('Index')]] + tl[htl.index('RightFlank')+1:] #print 'ntl',ntl fw.write('\t'.join(ntl)+'\n') #time.sleep(1) #elif tl[0] not in hg19lifts[tl[htl.index('Chr')]].keys(): else: #outputcounter+=1 ntl=tl[:htl.index('RightFlank')+1]+['-','-']+tl[htl.index('RightFlank')+1:] fw.write('\t'.join(ntl)+'\n') #print 'Number of Output lines are %s '%outputcounter fw.close() return def single_gts(tl1,htl, sample_code): """ This function gets a line and a headerline as input, splits it by tab and determines the genotype of individual returns a dictionary with UDP code as keys and pt raw genotypes and pt calculated genotypes as values gts [0]= patient genotype with two alleles gts [1]= patient calculated zygosity state (hom_ref, hom_var, het_complete, var_incoplete, ref_incomplete, novel_mutation, ref_novel_het, var_novel_het, NA ) gts [2]= mpg score gts [3]= coverage gts [4]= sex gts [5]= affected gts [6]= mom udpcode gts [7]= dad udpcode gts [8]= founder GT in DB gts [9]= birth year """ test=0 gts={} if len(tl1) != len(htl): sys.stderr.write('tl != htl len(tl)= %s len(htl)=%s\n'%(len(tl1), len(htl))) sys.stderr.write('tl= %s\n'%tl1) sys.stderr.write('htl= %s\n'%htl) raise Exception myvarfile = varfile(htl) #myvarfile.get_project_members(project_code) #myaffected = myvarfile.project_members pos = myvarfile.first_sample_index if test: print 'this is pos as a result of using myvarfile.first_sample_index\ with htl ',pos print 'this is tl [pos-5:pos+5]',tl1[int(pos)-5:int(pos)+5] print 'This is the item at tl[pos]',tl1[pos] time.sleep(1) print '\nline in get_genotypoes() is ',tl1 print 'patient data in line in get_genotypes_ is ', tl1[pos-1:] #print 'number of patients determined by get_genotypes_() is ', len(myaffected) time.sleep(1) #for sample_code,v in myaffected.iteritems(): #print 'in get_genotypes_ function: pt_code is => %s'%sample_code #time.sleep(1) #print "data dictionary = 1:hom ref,2:hom var, 3:het complete,\ #4:ref incomplete,5:var incompelete, 6:single novel mutation,\ #7:ref novel het, 8:var novel het , 9:hom_novel" gt_class="" if len(tl1[htl.index('%s.NA'%sample_code)])<2: if test: print "found incomplete genotype",tl1 print tl1[htl.index('%s.NA'%sample_code)][allele] #if tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["var_allele"]]: if tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('var_allele')]: gt_class="var_incomplete" #elif tl1[htl.index('%s.NA'%sample_code)]==tl1[myfilespecs["ref_allele"]]: elif tl1[htl.index('%s.NA'%sample_code)] == tl1[htl.index('ref_allele')]: gt_class="ref_incomplete" elif (tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)] != tl1[htl.index("var_allele")]): gt_class="novel_mutation" else: print "something wrong check the row",tl1 #time.sleep(1) elif len(tl1[htl.index('%s.NA'%sample_code)])==2: varcount=0 refcount=0 novelcount=0 if tl1[htl.index('%s.NA'%sample_code)]=="NA": gt_class="NA" else: #print "this is a valid genotype =>",tl1[htl.index('%s.NA'%sample_code)] #print tl1[htl.index('%s.NA'%sample_code)] #print tl1[htl.index('var_allele']] for allele in xrange(2): #print tl1[htl.index('%s.NA'%sample_code)][allele] if tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("var_allele")]: varcount+=1 elif tl1[htl.index('%s.NA'%sample_code)][allele]==tl1[htl.index("ref_allele")]: refcount+=1 elif (tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)][allele]!=tl1[htl.index("var_allele")]): novelcount+=1 #a=freq[tl1[htl.index('%s.NA'%sample_code)][allele]] #print a[0],a[1] #freql.append(a[1]+1) #freq[tl1[htl.index('%s.NA'%sample_code)][allele]]=[gt_id[0][1],a[1]+1] #print "varcount",varcount #print "ref count",refcount if varcount==2: gt_class="hom_var" elif refcount==2: gt_class="hom_ref" elif novelcount==2: gt_class="hom_novel" elif refcount==1 and varcount==1: gt_class="het_complete" elif refcount==1 and novelcount==1: gt_class="ref_novel_het" elif varcount==1 and novelcount==1: gt_class="var_novel_het" else: print "something wrong check the row",tl1[a:a+3] #print 'line is', line #print 'calculated genotype class is',gt_class #time.sleep(1) elif len(tl1[htl.index('%s.NA'%sample_code)])>2: if ":" in tl1[htl.index('%s.NA'%sample_code)]: varcount=0 refcount=0 novelcount=0 #print 'found a double colon seperated one',tl1 for allele in xrange(2): #print "allele = %s "%allele allele_gt="" if allele==0: allele_gt=tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))] elif allele==1: allele_gt=tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:] else: print "allele gt could not be determined" print "allele_gt is:",allele_gt #print tl1[htl.index('%s.NA'%sample_code)][allele] if allele_gt==tl1[htl.index("ref_allele")]: refcount+=1 elif allele_gt==tl1[htl.index("var_allele")]: varcount+=1 elif allele_gt!=tl1[htl.index("ref_allele")] and allele_gt!=tl1[htl.index("var_allele")]: novelcount+=1 else: print "something wrong with determining the genotype counts , check the row",tl1[a:a+3] if varcount==2: gt_class="hom_var" elif refcount==2: gt_class="hom_ref" elif novelcount==2: gt_class="hom_novel" elif refcount==1 and varcount==1: gt_class="het_complete" elif refcount==1 and novelcount==1: gt_class="ref_novel_het" elif varcount==1 and novelcount==1: gt_class="var_novel_het" else: print "ref gt=",tl1[htl.index("ref_allele")],len(tl1[htl.index("ref_allele")]) print "var gt=",tl1[htl.index("var_allele")],len(tl1[htl.index("var_allele")]) print "patient gt=",tl1[htl.index('%s.NA'%sample_code)],tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))],len(tl1[htl.index('%s.NA'%sample_code)][:(tl1[htl.index('%s.NA'%sample_code)].index(":"))]),tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:],len(tl1[htl.index('%s.NA'%sample_code)][(tl1[htl.index('%s.NA'%sample_code)].index(":"))+1:]) print "rc",refcount,"vc",varcount,"nc",novelcount print "something wrong determining gt class variables, check the row",tl1[a:a+3] #print ">2 freq1",freq1 #time.sleep(1) else: if tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("var_allele")]: gt_class="hom_var" elif tl1[htl.index('%s.NA'%sample_code)]==tl1[htl.index("ref_allele")]: gt_class="hom_ref" elif tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("ref_allele")] and tl1[htl.index('%s.NA'%sample_code)]!=tl1[htl.index("var_allele")]: gt_class="het_complete" else: print "something wrong check the row",tl1[a:a+3],tl1[htl.index('%s.NA'%sample_code)] else: print 'row is not 2 or more characters long check row',tl1 #read family data table in to a dictionary #patient_ids, family_data = p_ids() #print 'pt_code in get genotypes() function is =>',sample_code gts[sample_code] = [tl1[htl.index('%s.NA'%sample_code)], gt_class, tl1[htl.index('%s.NA'%sample_code)+1], #tl1[htl.index('%s.NA'%sample_code)+2]] + family_data[sample_code] tl1[htl.index('%s.NA'%sample_code)+2]] """ print 'could not get value into gts dictionary, family data may be'\ 'inconsistent with the variant report file make sure the patient'\ 'codes match between the two files' print 'Family data returned by p_ids() function as family_data =>',family_data print '\npt_code in get genotypes() is %s \n'%sample_code print 'tl1:',tl1 print 'gts dictionary is as follows' for k,v in gts.items(): print k,'==>',v raise Exception #print 'counter is %s'%counter #time.sleep(.5) """ #test code for the fucntion #if len(gts)!= len(myaffected): # print "num_pts is %s"%len(myaffected) # print "gts is %s items long"%len(gts) # print tl1 # print 'gts:',gts # raise Exception #for k,v in gts.items(): #print 'genotype dict entry is k =%s v=>%s'%(k,v) #time.sleep(1) # assert len(v)==11,'genotypes dictionary was not constructed correctly, please\ # check the results k=%s, v=%s v is %s items long'%(k,v,len(v)) #time.sleep(1) #print 'testcode gts=',gts #print tl1 #time.sleep(30) return gts def split_allfams_bysample(): """ takes and allfams file and takes all variant columns and a single sample that is supplied by udp numbers """ log_file_path = os.path.join(os.path.dirname(args.output_file), 'log.txt') log_file = open(log_file_path, 'a') infile = open(args.input_file) hline =infile.readline() htl = hline.strip().split('\t') myvarfile = varfile(htl) #print myvarfile.first_sample_index nhtl = htl[:myvarfile.first_sample_index] fam_members_indexes=[] no_samples_in_current_all_projects_file = True if '%s.NA'%args.project_code in htl: #print htl.index('%s.NA'%sample) fam_members_indexes.append(htl.index('%s.NA'%args.project_code)) log_file.write('Sample %s is found in the input allfamilies file\n'%(args.project_code)) no_samples_in_current_all_projects_file = False else: log_file.write('Sample %s is not found in the input allfamilies file\n'%(args.project_code)) for x in fam_members_indexes: nhtl.append(htl[x]) nhtl.append(htl[x+1]) nhtl.append(htl[x+2]) myout = open(args.output_file,'w') if args.extract_subject_only: myout.write('\t'.join(nhtl)+'\n') else: myout.write('\t'.join(htl)+'\n') if no_samples_in_current_all_projects_file: pass else: #c=0 #c1=0 for line in infile: #c +=1 #print c #if c>200: # break tl = line.strip().split('\t') ntl = tl[:myvarfile.first_sample_index] for x in fam_members_indexes: ntl.append(tl[x]) ntl.append(tl[x+1]) ntl.append(tl[x+2]) mygts = single_gts(ntl, nhtl , args.project_code) assert len(mygts)==1 all_are_homref_or_NA = True gt_homref_and_ref_is_minor = False for k,v in mygts.iteritems(): if (v[1] != 'hom_ref' and not tl[htl.index('All_Gahlref_is_minor')]=="1") and v[1] != 'NA': all_are_homref_or_NA = False if (v[1] == 'hom_ref' and tl[htl.index('All_Gahlref_is_minor')]=="1"): gt_homref_and_ref_is_minor = True if not all_are_homref_or_NA or gt_homref_and_ref_is_minor: #c1+=1 #print mygts #print tl[htl.index('All_Gahlref_is_minor')] #time.sleep(0.1) if args.extract_subject_only: myout.write('\t'.join(ntl)+'\n') else: myout.write('\t'.join(tl)+'\n') myout.close() #else: # sys.stderr.write(line) #myvarfile = varfile(args.input_file) #myvarfile.get_project_members(args.project_code) #myaffected = myvarfile.project_members def split_allfams_byfamily(): """ takes and allfams file and takes all variant columns and only samples that are supplied in the family file by udp numbers """ samples_table = Table('varmd_samples_table', metadata, autoload=True) infile = open(args.input_file) hline =infile.readline() htl = hline.strip().split('\t') myvarfile = varfile(htl) myaffected = myvarfile.get_project_members(args.project_code) #print myvarfile.first_sample_index nhtl = htl[:myvarfile.first_sample_index] fam_members_indexes=[] no_samples_in_current_all_projects_file = True for sample in myaffected.keys(): if '%s.NA'%sample in htl: #print htl.index('%s.NA'%sample) fam_members_indexes.append(htl.index('%s.NA'%sample)) sys.stdout.write('FAMILY/PROJECT CODE= %s : Sample %s is found in the input allfamilies file\n'%(args.project_code, sample)) updt = samples_table.update().where(samples_table.c.sample_code == sample).values( data_in_allprojects_file = 'Yes') updt.execute() no_samples_in_current_all_projects_file = False else: sys.stdout.write('NOTICE: FAMILY/PROJECT CODE= %s : Sample %s is not found in the input allfamilies file\n'%(args.project_code, sample)) updt = samples_table.update().where(samples_table.c.sample_code == sample).values( data_in_allprojects_file = 'No') updt.execute() #raise Exception for x in fam_members_indexes: nhtl.append(htl[x]) nhtl.append(htl[x+1]) nhtl.append(htl[x+2]) myout = open(args.output_file,'w') myout.write('\t'.join(nhtl)+'\n') if no_samples_in_current_all_projects_file: pass else: first_sample_index = myvarfile.first_sample_index patient_ids, family_data = p_ids() different_phenotypes_are_present = False if len(set([v['affected'] for v in myaffected.values()]) ) > 1: different_phenotypes_are_present = True #c=0 for line in infile: #c +=1 #print c #if c>800: # break tl = line.strip().split('\t') ntl = tl[:first_sample_index] for x in fam_members_indexes: ntl.append(tl[x]) ntl.append(tl[x+1]) ntl.append(tl[x+2]) mygts = get_genotypes(ntl, nhtl, myaffected, first_sample_index, family_data) all_affecteds_are_hom_ref = True all_are_homref_or_NA = True all_genotypes_are_same = True if len(set([v[1] for v in mygts.values()])) > 1: all_genotypes_are_same = False elif len(set([v[1] for v in mygts.values()])) == 1: all_genotypes_are_same = True elif len(set([v[1] for v in mygts.values()])) == 0 and len(mygts) == 0: pass else: for k,v in mygts.iteritems(): print k,v print (set([v[1] for v in mygts.values()])) raise Exception for k,v in mygts.iteritems(): if v[1] != 'hom_ref' and v[1] != 'NA': all_are_homref_or_NA = False if int(myaffected[k]['affected']) == 1 and v[1] != 'hom_ref': all_affecteds_are_hom_ref = False if ( (not all_are_homref_or_NA) and (not all_affecteds_are_hom_ref) and (not (different_phenotypes_are_present and all_genotypes_are_same)) ): myout.write('\t'.join(ntl)+'\n') myout.close() #else: # sys.stderr.write(line) #myvarfile = varfile(args.input_file) #myvarfile.get_project_members(args.project_code) #myaffected = myvarfile.project_members def annotate_with_interval_data(): """ python --func annotate_with_interval_data --input [varmd_file] --annotation_file [validation_file] > outputfile """ annot_lines = [x.strip().split('\t') for x in open(args.annotation_file) if 'chr' in x.lower()] if (not 'Chr' in annot_lines[0]) or (not 'LeftFlank' in annot_lines[0]) or (not 'RightFlank' in annot_lines[0]) : raise Exception assert len(annot_lines) >=2, 'annotation file seems empty' infile = open(args.input_file) htl = infile.readline().strip().split('\t') if (not 'Chr' in htl) or (not 'LeftFlank' in htl) or (not 'RightFlank' in htl) : print htl raise Exception htl.insert(0,'Validation') print '\t'.join(htl) used_for_annotations = [] for line in infile: tl = line.strip().split('\t') tl.insert(0,'-') for atl in annot_lines[1:]: if (tl[htl.index('Chr')] == atl[annot_lines[0].index('Chr')] and tl[htl.index('LeftFlank')] == atl[annot_lines[0].index('LeftFlank')] and tl[htl.index('RightFlank')] == atl[annot_lines[0].index('RightFlank')]): tl[htl.index('Validation')] = atl[0] annot_lines.remove(atl) used_for_annotations.append(atl) print '\t'.join(tl) """ print 'tl',tl print 'atl',atl print 'annot_lines',annot_lines print 'annot_lines[0]',annot_lines[0] print 'htl.index(chr)',htl.index('Chr') """ with open('%s.used'%args.annotation_file,'a') as fw: for atl in used_for_annotations: fw.write('\t'.join(atl)+'\n') def run_analysis_single_file(configfile): """ creates the necessary files for analysis pipeline workflow under Biowulf cluster """ test=0 verbose=0 skip_hg19_add_flag = 0 nodetype = mycfg.nodetype #print os.path.dirname(inputvsfile) if args.input_file: inputvsfile = os.path.abspath(args.input_file) else: print 'Please provide a varmd input file by --input [varmd_input_file_path]' sys.exit() if args.project_code: project_code = args.project_code else: print 'please provide a project code by --project_code [project_code]' sys.exit() tmpdir='{0}_tmp'.format(os.path.join( os.path.split(os.path.dirname(inputvsfile))[0], os.path.basename(inputvsfile) )) #tmpdir='./{}_tmp'.format(os.path.basename(inputvsfile)) if verbose: print os.path.dirname(os.path.abspath(inputvsfile)) print os.path.basename(inputvsfile) print 'tmpdir',tmpdir if not os.path.exists(tmpdir): os.mkdir(tmpdir) masterpath='%s/%s_master.sh'%(tmpdir,os.path.basename(inputvsfile)) fm=open(masterpath,'w') if nodetype=='g72': nodetypeswarm=':g72 -n 8' nodetypeqsub=':g72' elif nodetype=='g24': nodetypeswarm=':g24 -n 8' nodetypeqsub=':g24' elif nodetype=='norm': nodetypeswarm='' nodetypeqsub='' else: print 'something wrong with the node selection' sys.exit() depend=0 with open(inputvsfile) as myvsfile: hline = myvsfile.readline() if ('hg19LeftFlank' in hline) and ('hg19RightFlank' in hline): skip_hg19_add_flag =1 if skip_hg19_add_flag: #the header line should be processed and hg19LeftFlank and hg19RightFlank should be present in the file hg19outfile = '{0}/{1}_withhg19'.format(tmpdir,os.path.basename(inputvsfile)) os.system('cp {0} {1}'.format(inputvsfile,hg19outfile)) else: ############################################################################## outfile='%s/%s_liftbed'%(tmpdir,os.path.basename(inputvsfile)) prep_liftover_path='%s/1_%s_prepare_liftover_input.sh'%(tmpdir, os.path.basename(inputvsfile) ) plf=open(prep_liftover_path,'w') #plf.write('%s %s --func prepare_liftover_input %s %s'%(myutils.python_path, plf.write('%s --func prepare_liftover_input --config %s --input %s --output %s'%( sys.argv[0], configfile, inputvsfile, outfile )) plf.close() #prepare the input file for liftOver depend+=1 fm.write('jobid{0}=`qsub -l nodes=1{1} -N lft2{3} {4}`\n'.format( depend, nodetypeqsub, depend-1, project_code, prep_liftover_path)) fm.write('echo $jobid{0}\n'.format(depend)) ############################################################################### liftoverpath='%s/2_%s.fixed_liftover.sh'%(tmpdir,os.path.basename(inputvsfile)) lf=open(liftoverpath,'w') liftoverout = '{0}/{1}_liftedbed'.format(tmpdir,os.path.basename(inputvsfile)) liftoverunmapped = '{0}/{1}_hg19liftover_unmapped'.format(tmpdir,os.path.basename(inputvsfile)) lf.write('{0} {1} {2} {3} {4}'.format( mycfg.liftOver_path, outfile, mycfg.liftOver_chain_path, liftoverout, liftoverunmapped )) lf.close() #print 'check liftover file at %s'%liftoverpath #time.sleep(1000) #run liftOver depend+=1 fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2} -N lftr3{3} {4}`\n'.format( depend, nodetypeqsub, depend-1, project_code, liftoverpath)) fm.write('echo $jobid{0}\n'.format(depend)) ################################################################################ add_hg19_path='%s/3_%s_add_hg19.sh'%(tmpdir,os.path.basename(inputvsfile)) ahgf=open(add_hg19_path,'w') hg19outfile = '{0}/{1}_withhg19'.format(tmpdir,os.path.basename(inputvsfile)) ahgf.write('{0} --func add_hg19 --config {1} --input {2} --liftover {3} --output {4}\n'.format( sys.argv[0], configfile, inputvsfile, liftoverout, hg19outfile, )) ahgf.close() #add hg19 coordinates to file based on liftOver output depend+=1 fm.write('jobid{0}=`swarm --jobarray --singleout\ -f {1}\ -W depend=afterany:$jobid{2}\ -l nodes=1{3}\ -N ad19_4{4}\ -b 3`\n'.format(depend, add_hg19_path, depend-1, nodetypeswarm, project_code) ) fm.write('echo $jobid{0}\n'.format(depend)) #################################################################################### skip_seattleseq_flag = False with open(inputvsfile) as myvsfile: hline = myvsfile.readline() if ('ss_chromosome' in hline) and ('ss_position' in hline): skip_seattleseq_flag = True prep_ss_path='%s/4_%s_prep_seattleSeq.sh'%(tmpdir,os.path.basename(inputvsfile)) fw=open(prep_ss_path,'w') outpath_snv = os.path.splitext(hg19outfile)[0]+'.ss_snv' outpath_indel = os.path.splitext(hg19outfile)[0]+'.ss_indel' fw.write('{0} --func prep_ss --input {1} --output_snv {2} --output_indel {3} --config {4}\n'.format( sys.argv[0], hg19outfile, outpath_snv, outpath_indel, configfile )) fw.close() # depend+=1 fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\ -N pss_51{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, prep_ss_path )) fm.write('echo $jobid{0}\n'.format(depend)) ################################################################################### #incoporate Kaviar, 1000 genomes and allsamples file data into varsifter files dbsnp_outfile = '{0}_dbsnp'.format(hg19outfile) annotate_dbsnp_path='%s/5_%s.annotate_with_dbsnp.sh'%(tmpdir,os.path.basename(inputvsfile)) dbsnp_in_file_flag = False dbsnpheader_in_file_flag = True if os.path.exists(hg19outfile): if verbose: print 'hg19 file exists' , hg19outfile with open(hg19outfile) as myvsfile: hline = myvsfile.readline() htl = hline.strip().split('\t') headers_indbsnp_added_file = [ 'Homozygous_VarAllele_UDP_count', 'Single_VarAllele_UDP_count', 'all_mpg:coverage_score', 'dbsnp137G5', 'dbsnp137ClinicalSignificance', 'dbsnp137GMAF', 'dbsnp137RS#', 'dbsnp137', 'predicted_expression_snp', 'varscore', ] for header_item in headers_indbsnp_added_file: if header_item not in hline: dbsnpheader_in_file_flag = False if verbose: print 'header item not found in the file header item not found=%s'%header_item if dbsnpheader_in_file_flag: dbsnp = [] linecounter =0 for line in myvsfile: tl = line.strip().split('\t') dbsnp.append(tl[htl.index('dbsnp137')]) if linecounter > 500: break myset= set(dbsnp) if len(myset)>1: dbsnp_in_file_flag = True if dbsnp_in_file_flag and dbsnpheader_in_file_flag: #the header line should be processed and hg19LeftFlank and hg19RightFlank should be present in the file if verbose: print 'dbsnp in file flag true, dbsnp header in file flag true' os.system('cp {0} {1}'.format(hg19outfile,dbsnp_outfile)) f_tmp = open(annotate_dbsnp_path,'w') f_tmp.write('{0} --func annotate_with_dbsnp --config {1} --input {2} --output {3}\n'.format( sys.argv[0], configfile, hg19outfile, dbsnp_outfile, )) f_tmp.close() if not os.path.exists(hg19outfile) or not dbsnp_in_file_flag: if verbose: print 'hg19 file doesn\'t exist or dbsnp data not in file' dbsnpheader_in_file_flag = False f_tmp = open(annotate_dbsnp_path,'w') f_tmp.write('{0} --func annotate_with_dbsnp --config {1} --input {2} --output {3}\n'.format( sys.argv[0], configfile, hg19outfile, dbsnp_outfile, )) f_tmp.close() depend+=1 if depend>1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -W depend=afterany:$jobid{2} -l nodes=1{3} -N ann6{4}\ -p 100 -b 8 `\n'.format(depend, annotate_dbsnp_path, depend-1, nodetypeswarm, project_code )) elif depend<=1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -l nodes=1{2} -N ann6{3} -p 100 -b 8`\n'.format(depend, annotate_dbsnp_path, nodetypeswarm, project_code )) fm.write('echo $jobid{0}\n'.format(depend)) #################################################################################### allfams_outfile= '{0}_pop'.format(dbsnp_outfile) allfams_in_file_flag=False if os.path.exists(dbsnp_outfile): with open(dbsnp_outfile) as myvsfile: hline = myvsfile.readline() htl = hline.strip().split('\t') linecounter =0 single_VariantAllele_UDP_counts = [] if 'Single_VarAllele_UDP_count' in hline: for line in myvsfile: tl = line.strip().split('\t') single_VariantAllele_UDP_counts.append(tl[htl.index('Single_VarAllele_UDP_count')]) if linecounter > 500: break myset= set(single_VariantAllele_UDP_counts) if len(myset)>1: allfams_in_file_flag = True if allfams_in_file_flag: os.system('cp {0} {1}'.format(dbsnp_outfile,allfams_outfile )) else: annotate_allfams_path='%s/6_%s.annotate_with_allfams.sh'%(tmpdir,os.path.basename(inputvsfile)) f_tmpswarm=open(annotate_allfams_path,'w') f_tmpswarm.write('{0} --func annotate_with_allfams --config {1} --input {2}_dbsnp\ --output {2}_dbsnp_pop\n'.format( sys.argv[0], configfile, hg19outfile, )) f_tmpswarm.close() #------------------------------------------------------------------------------- #incoporate allsamples file data into varsifter files depend+=1 if depend>1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -W depend=afterany:$jobid{2} -l nodes=1{3} -N af7{4}\ -p 100 -b 2`\n'.format(depend, annotate_allfams_path, depend-1, nodetypeswarm, project_code )) elif depend<=1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -l nodes=1{2} -N af7{3} -p 100 -b 2`\n'.format(depend, annotate_allfams_path, nodetypeswarm, project_code )) fm.write('echo $jobid{0}\n'.format(depend)) ############################################################################### seattle_seq_snv_path='%s/7_%s_seattleseq_snv.sh'%(tmpdir, os.path.basename(inputvsfile) ) sssnv_outfile ="%s_ss_snv"%allfams_outfile ffw=open(seattle_seq_snv_path,'w') ffw.write('{0} --func annotate_ss_snv --config {1} --input {2}\ --output {3}\n'.format( sys.argv[0], configfile, allfams_outfile, sssnv_outfile, )) ffw.close() if skip_seattleseq_flag == True: os.system('cp {0} {1}'.format(allfams_outfile ,sssnv_outfile)) depend+=1 if depend>1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -W depend=afterany:$jobid{2} -l nodes=1{3} -N sssnv{4}\ -p 100 -b 2 `\n'.format(depend, seattle_seq_snv_path, depend-1, nodetypeswarm, project_code )) elif depend<=1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -l nodes=1{2} -N sssnv{3} -p 100 -b 2 `\n'.format(depend, seattle_seq_snv_path, nodetypeswarm, project_code )) fm.write('echo $jobid{0}\n'.format(depend)) ############################################################################### seattle_seq_indel_path='%s/8_%s_seattleseq_indel.sh'%(tmpdir, os.path.basename(inputvsfile) ) ssindel_outfile ="%s_indel"%sssnv_outfile ffw=open(seattle_seq_indel_path,'w') ffw.write('{0} --func annotate_ss_indel --config {1} --input {2}\ --output {3}\n'.format( sys.argv[0], configfile, sssnv_outfile, ssindel_outfile, )) ffw.close() if skip_seattleseq_flag == True: os.system('cp {0} {1}'.format(sssnv_outfile,ssindel_outfile)) depend+=1 if depend>1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -W depend=afterany:$jobid{2} -l nodes=1{3} -N ssind{4}\ -p 100 -b 2 `\n'.format(depend, seattle_seq_indel_path, depend-1, nodetypeswarm, project_code )) elif depend<=1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -l nodes=1{2} -N ssind{3} -p 100 -b 2 `\n'.format(depend, seattle_seq_indel_path, nodetypeswarm, project_code )) fm.write('echo $jobid{0}\n'.format(depend)) ############################################################################### annotate_with_ESP_path='%s/9_%s_annotate_with_ESP.sh'%(tmpdir, os.path.basename(inputvsfile) ) annotate_with_ESP_outfile ="%s_ESP"%ssindel_outfile ffw = open(annotate_with_ESP_path,'w') ffw.write('{0} --func annotate_with_ESP --config {1} --input {2}\ --output {3}\n'.format( sys.argv[0], configfile, ssindel_outfile, annotate_with_ESP_outfile, )) ffw.close() depend+=1 if depend>1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -W depend=afterany:$jobid{2} -l nodes=1{3} -N ssind{4}\ -p 100 -b 2 `\n'.format(depend, annotate_with_ESP_path, depend-1, nodetypeswarm, project_code )) elif depend<=1: fm.write('jobid{0}=`swarm --jobarray --singleout -f {1}\ -l nodes=1{2} -N ssind{3} -p 100 -b 2 `\n'.format(depend, annotate_with_ESP_path, nodetypeswarm, project_code )) fm.write('echo $jobid{0}\n'.format(depend)) ##################################################################################### scandirscdpred_path='%s/10_%s.scandirs_cdpred'%(tmpdir,os.path.basename(inputvsfile)) fscdr = open(scandirscdpred_path,'w') cdpredoutfile='%s.cdpredneg'%annotate_with_ESP_outfile fscdr.write('{0} --func scandirs_cdpred --config {1} --input {2} --output {3} \n'.format( sys.argv[0], configfile, annotate_with_ESP_outfile, cdpredoutfile, )) fscdr.close() #------------------------------------------------------------------------------- #filter for only cdpred negative(predicted deleterious) variants depend+=1 if depend>1: fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\ -N cdn10{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, scandirscdpred_path )) fm.write('echo $jobid{0}\n'.format(depend)) elif depend<=1: fm.write('jobid{0}=`qsub -l nodes=1{1} -N cdn10{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, scandirscdpred_path )) fm.write('echo $jobid{0}\n'.format(depend)) ################################################################################### #filters for hom_rec, dom and x-linked rec simple_mendelian_filters_path = '%s/11_%s.mendelian_filters'%(tmpdir,os.path.basename(inputvsfile)) fmhr = open(simple_mendelian_filters_path,'w') output_path = '{0}.mendelian'.format(cdpredoutfile) include_bed_file = args.include_bed_file linkage_bed_file = args.linkage_bed_file cnv_bed_file = args.cnv_bed_file if args.nextgene_data: nextgene_data_flag = "--nextgene_data" else: nextgene_data_flag = "" fmhr.write('{0} --func simple_mendelian_filters --config {1} --input {2}\ --include_bed_file {3} --linkage_bed_file {4} --cnv_bed_file {5} --output {6} {7} {8}\n'.format( sys.argv[0], configfile, cdpredoutfile, include_bed_file, linkage_bed_file, cnv_bed_file, output_path, project_code, nextgene_data_flag, )) fmhr.close() depend+=1 if depend>1: fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\ -N hm11{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, simple_mendelian_filters_path )) fm.write('echo $jobid{0}\n'.format(depend)) elif depend<=1: fm.write('jobid{0}=`qsub -l nodes=1{1} -N hm11{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, simple_mendelian_filters_path )) fm.write('echo $jobid{0}\n'.format(depend)) #------------------------------------------------------------------------------- #filters for hom_rec, dom and x-linked rec and compound heterezygous comhet_mendelian_filter_path='%s/12_%s.comhet_mendelian_filter'%(tmpdir,os.path.basename(inputvsfile)) fmhr = open(comhet_mendelian_filter_path,'w') comhet_output = '{0}.com_het'.format(cdpredoutfile) include_bed_file = args.include_bed_file linkage_bed_file = args.linkage_bed_file cnv_bed_file = args.cnv_bed_file if args.nextgene_data: nextgene_data_flag="--nextgene_data" else: nextgene_data_flag="" fmhr.write('{0} --func mendelian_filter_com_het_res --config {1} --input {2} --output {3} {4} {5}\n'.format( sys.argv[0], configfile, cdpredoutfile, comhet_output, project_code, nextgene_data_flag, )) fmhr.close() depend+=1 if depend>1: fm.write('jobid{0}=`qsub -l nodes=1{1} -W depend=afterany:$jobid{2}\ -N hm11{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, comhet_mendelian_filter_path )) fm.write('echo $jobid{0}\n'.format(depend)) elif depend<=1: fm.write('jobid{0}=`qsub -l nodes=1{1} -N hm11{3} {4}`\n'.format(depend, nodetypeqsub, depend-1, project_code, comhet_mendelian_filter_path )) fm.write('echo $jobid{0}\n'.format(depend)) #------------------------------------------------------------------------------- fm.close() subprocess.call(['chmod','754',masterpath]) #subprocess.call(['/bin/bash',masterpath]) #------------------------------------------------------------------------------- #sequencial run script steps = [x for x in range(12)] if skip_hg19_add_flag == 1: steps.remove(0) steps.remove(1) steps.remove(2) if skip_seattleseq_flag: steps.remove(3) steps.remove(6) steps.remove(7) if dbsnpheader_in_file_flag: steps.remove(4) if allfams_in_file_flag: steps.remove(5) masterpath_seq='%s/%s_master_seq.sh'%(tmpdir,os.path.basename(inputvsfile)) #sequencial run fm_seq=open(masterpath_seq,'w') fm_seq.write('#!/usr/bin/env bash\n') if 0 in steps: fm_seq.write('{0}\n'.format(prep_liftover_path)) subprocess.call(['chmod','754',prep_liftover_path]) if 1 in steps: fm_seq.write('{0}\n'.format(liftoverpath)) subprocess.call(['chmod','754',liftoverpath]) if 2 in steps: fm_seq.write('{0}\n'.format(add_hg19_path)) subprocess.call(['chmod','754',add_hg19_path]) if 3 in steps: fm_seq.write('{0}\n'.format(prep_ss_path)) subprocess.call(['chmod','754',prep_ss_path]) if 4 in steps: fm_seq.write('{0}\n'.format(annotate_dbsnp_path)) subprocess.call(['chmod','754',annotate_dbsnp_path]) if 5 in steps: fm_seq.write('{0}\n'.format(annotate_allfams_path)) subprocess.call(['chmod','754',annotate_allfams_path]) if 6 in steps: fm_seq.write('{0}\n'.format(seattle_seq_snv_path)) subprocess.call(['chmod','754',seattle_seq_snv_path]) if 7 in steps: fm_seq.write('{0}\n'.format(seattle_seq_indel_path)) subprocess.call(['chmod','754',seattle_seq_indel_path]) if 8 in steps: fm_seq.write('{0}\n'.format(annotate_with_ESP_path)) subprocess.call(['chmod','754',annotate_with_ESP_path]) if 9 in steps: fm_seq.write('{0}\n'.format(scandirscdpred_path)) subprocess.call(['chmod','754',scandirscdpred_path]) if 10 in steps: fm_seq.write('{0}\n'.format(simple_mendelian_filters_path)) subprocess.call(['chmod','754',simple_mendelian_filters_path]) if 11 in steps: fm_seq.write('{0}\n'.format(comhet_mendelian_filter_path)) subprocess.call(['chmod','754',comhet_mendelian_filter_path]) fm_seq.close() subprocess.call(['chmod','754',masterpath_seq]) print 'varmd_input_file_path:{0}'.format(inputvsfile) print 'varmd_distributed_qsub_script_path:{0}'.format(masterpath) print 'varmd_sequential_script_path:{0}'.format(masterpath_seq) #subprocess.call(['/bin/bash',masterpath_seq]) def prep_seattle_seq_input(hg19file, outpath_snv, outpath_indel, configfile): """ creates the input files that need to be uploaded to Seatlle Seq web site for annotation If in the first pass a seattle seq input file is generated, run those files through seattleseq web site and put the seattleseq output files in the same directory with the main annotation file and run this function again it will look for the incremental annotation file and merge it with the main file """ sys.stderr.write('NOTICE: varmd(prepare_seattleseq_input) started for %s\n'%hg19file) import gzip myfile = open(hg19file) hline = myfile.readline() htl = hline.strip().split('\t') snpcounter=0 indelcounter=0 outfile_snv = open(outpath_snv,'w') outfile_indel = open(outpath_indel,'w') snvdict={} indeldict={} filesdict={} filessnvcount=0 filesindelcount=0 filesnvsize1=0 fileindelsize1=0 bigfilesnvsize=0 bigfileindelsize=0 totalsnvlines=[] totalindellines=[] headersnv1=[] headersnv2=[] headerindel1=[] headerindel2=[] for file_ in glob.glob(os.path.join(mycfg.seattleseq_dir_path,'*')): if ('SeattleSeqAnnotation' in file_) and ('ss_snv' in file_): sys.stderr.write('found snv file %s\n'%file_) filessnvcount+=1 filesdict[file_]={} filesdict[file_]['size']=os.path.getsize(file_) sys.stderr.write('Reading %s into memory\n'%file_) ss_snv_lines = gzip.open(file_,'rb').readlines() filesdict[file_]['type']='snv' if filessnvcount==1: filesnvsize1=os.path.getsize(file_) totalsnvlines.extend(ss_snv_lines) headersnv1=ss_snv_lines[0] elif filessnvcount==2: sys.stderr.write('Two SeattleSeqAnnoattion files were found, consolidating ...\n') bigfilesnvsize = max(os.path.getsize(file_),filesnvsize1) totalsnvlines.extend(ss_snv_lines[1:]) headersnv2=ss_snv_lines[0] if not headersnv1==headersnv2: print 'The two snv annotation files do not have the same headers,please use files created by the same SeattleSeq version' htl1 = headersnv1.strip().split('\t') htl2 = headersnv2.strip().split('\t') print 'file1 has %d headers, file2 has %d headers'%(len(htl1), len(htl2)) raise Exception elif filessnvcount>2: raise Exception,'more than 2 SeattleSeq annotation files were found, please use only two files for incremental annotation' snvhtl = ss_snv_lines[0].strip().split('\t') for snvline in ss_snv_lines: if not snvline.startswith('#'): snvtl = snvline.strip().split('\t') snvdict[(snvtl[snvhtl.index('chromosome')],snvtl[snvhtl.index('position')])] = snvtl elif (('SeattleSeqAnnotation' in file_) and ('ss_indel' in file_)): sys.stderr.write('found indel file %s\n'%file_) filesindelcount+=1 filesdict[file_]={} filesdict[file_]['size']=os.path.getsize(file_) filesdict[file_]['type']='indel' ss_indel_lines=gzip.open(file_,'rb').readlines() if filesindelcount==1: fileindelsize1=os.path.getsize(file_) totalindellines.extend(ss_indel_lines) headerindel1=ss_indel_lines[0] elif filesindelcount==2: bigfileindelsize = max(os.path.getsize(file_),fileindelsize1) totalindellines.extend(ss_indel_lines[1:]) headerindel2=ss_indel_lines[0] assert headerindel1==headerindel2,'The two indel annotation files do not have the same headers,please use files created by the same SeattleSeq version' elif filesindelcount>2: raise Exception,'more than 2 SeattleSeq annotation files were found, please use only two files for incremental annotation' indelhtl = ss_indel_lines[0].strip().split('\t') for indelline in ss_indel_lines: if not indelline.startswith('#'): indeltl = indelline.strip().split('\t') indeldict[(indeltl[indelhtl.index('chromosome')], indeltl[indelhtl.index('position')], indeltl[indelhtl.index('sampleAlleles')])] = indeltl for k,v in filesdict.iteritems(): if v['size'] == bigfilesnvsize and v['type']=='snv': fb = gzip.open(k,'wb') fb.writelines(totalsnvlines) fb.close() elif v['size'] == bigfileindelsize and v['type']=='indel': fb = gzip.open(k,'wb') fb.writelines(totalindellines) fb.close() for k,v in filesdict.iteritems(): if v['size']!=bigfilesnvsize and v['type']=='snv' and headersnv2:#delete the smaller file only if the second file header exists os.system('rm %s'%k) elif v['size']!=bigfileindelsize and v['type']=='indel' and headerindel2: #delete the smaller file only if the second file header exists os.system('rm %s'%k) sys.stderr.write('consolidation of seattleseq annotation files is complete, starting to read input file\n') #for tl in [x.strip().split('\t') for x in mylines[1:]]: linecounter=0 inthemasterfilecounter=0 for line in myfile: linecounter+=1 tl = line.strip().split('\t') inserted=None deleted=None # if tl[htl.index('hg19LeftFlank')]=='207881485': # print tl[htl.index('ref_allele')] # print len(tl[htl.index('ref_allele')]) # print "'" in tl[htl.index('ref_allele')] # print tl[htl.index('var_allele')] # print len(tl[htl.index('var_allele')]) # print type(tl[htl.index('var_allele')]) # raise Exception if tl[htl.index('hg19LeftFlank')]!='-': flag=0 if (len(tl[htl.index('var_allele')]) == 1 and tl[htl.index('var_allele')] != '-' and tl[htl.index('var_allele')] != '*' and tl[htl.index('ref_allele')] in ['A','T', 'C','G']): if (tl[htl.index('Chr')].replace('chr',''),str(int(tl[htl.index('hg19LeftFlank')])+1)) in snvdict: inthemasterfilecounter+=1 #print 'I found the position in the ss_snv file, passing' #time.sleep(1) else: #print (tl[htl.index('Chr')].replace('chr',''),str(int(tl[htl.index('hg19LeftFlank')])+1)) #time.sleep(1) outfile_snv.write('\t'.join( [tl[htl.index('Chr')], str(int(tl[htl.index('hg19LeftFlank')])+1), tl[htl.index('var_allele')]] )+'\n' ) snpcounter+=1 flag=1 # the location or position in the seattle seq file if the left flank position of the indel #in other words the positions one base before the indel elif "'" in tl[htl.index('ref_allele')]: if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '+%s:0/0'%tl[htl.index('var_allele')]) in indeldict: #print 'I found this indel in the dictioanry' pass else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19LeftFlank')], '+%s:0/0'%tl[htl.index('var_allele')]] )+'\n' ) indelcounter+=1 flag=2 elif (tl[htl.index('var_allele')]!="''" and len(tl[htl.index('ref_allele')]) < len(tl[htl.index('var_allele')])): inserted = tl[htl.index('var_allele')].replace(tl[htl.index('ref_allele')],'',1) if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '+%s:0/0'%inserted) in indeldict: #print 'I found this indel in the dictioanry' pass else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19LeftFlank')], '+%s:0/0'%inserted])+'\n') indelcounter+=1 flag=3 elif "'" in tl[htl.index('var_allele')]: if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '-%s:0/0'%tl[htl.index('ref_allele')]) in indeldict: #print 'I found this indel in the dictioanry' pass else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], str(int(tl[htl.index('hg19LeftFlank')]) + len(tl[htl.index('ref_allele')])), '-%s:0/0'%tl[htl.index('ref_allele')]] )+'\n' ) indelcounter+=1 flag=4 elif (tl[htl.index('var_allele')]!="''" and len(tl[htl.index('var_allele')]) < len(tl[htl.index('ref_allele')])): deleted=tl[htl.index('ref_allele')].replace(tl[htl.index('var_allele')],'',1) if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '-%s:0/0'%deleted) in indeldict: #print 'I found this indel in the dictioanry' inthemasterfilecounter+=1 else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], str(int(tl[htl.index('hg19LeftFlank')])+len(tl[htl.index('var_allele')])), '-%s:0/0'%deleted])+'\n') indelcounter+=1 flag=5 #if tl[htl.index('hg19LeftFlank')]=='207881485': # print flag outfile_snv.close() outfile_indel.close() sys.stderr.write ('total number of lines processed = %s \n'%linecounter) sys.stderr.write('number of exported snps %s \n'%snpcounter) sys.stderr.write('number of exported indels %s \n'%indelcounter) sys.stderr.write('number of variants not exported is %s \n'%(linecounter - (snpcounter+indelcounter+1))) sys.stderr.write('%s of these not exported were already in the masterfile \n'%(inthemasterfilecounter)) def check_varsifter_file_format(input_file): flag = True myfile =open(input_file) hline = myfile.readline() htl = hline.strip().split('\t') assert '\t' in hline,('The file is not tab delimited, please see varmd file specifications') myvarfile = varfile(htl) patient_pos = myvarfile.first_sample_index assert isinstance(patient_pos,int), 'Returned patient position is not a number\ check your cohort(allfams) file' not_found_list=[] csnum=801 #number of Clinseq samples used in diferent varsiftr headers varsifter_format = ['Index','Chr','LeftFlank','RightFlank','ref_allele','muttype','var_allele','type', 'Gene_name','loc_type','consequence', 'CS_%sc_genotypes'%csnum, 'CS_%sc_homref'%csnum, 'CS_%sc_het'%csnum, 'CS_%sc_homvar'%csnum, 'CS_%sc_hetnonref'%csnum, 'CS_%sc_other'%csnum, 'CS_%sc_refallele'%csnum, 'CS_%sc_varallele'%csnum, 'CS_%sfreq_homref'%csnum, 'CS_%sfreq_het'%csnum, 'CS_%sfreq_homvar'%csnum, 'CS_%sfreq_hetnonref'%csnum, 'CS_%sfreq_refallele'%csnum, 'CS_%sfreq_varallele'%csnum, 'CS_%sref_is_minor'%csnum, 'CS_%sc_major'%csnum, 'CS_%sc_minor'%csnum, 'CS_%smaf'%csnum, 'CS_%schisquare'%csnum, 'transcript', 'ref_aa', 'var_aa', 'aa_pos', 'CDPred_score', 'dbID', 'HGMDids', 'HGMDdisease', 'HGMDtags', 'HGMD_HGVS', 'REF_MATCHES_HGMD_DISEASE_ALLELE', 'HGMD_ALLELES_in_GENOME_STRAND', 'HGMDinGene', '1K_total_depth_at_site', '1K_rms_mapping_quality', '1K_#samples_with_covg', '1K_#alleles_in_samples_with_covg', '1K_alt_allele_cnts_in_samples_with_covg', '1K_alt_allele_freqs', '1K_allele_balance_in_hets'] for item in varsifter_format: found = 0 for x in htl: if item == x: found = 1 break if found == 0: not_found_list.append(item) for item in htl: if item.startswith('UDP') and not (item.endswith('.NA') or item.endswith('.NA.score') or item.endswith('.NA.coverage')): flag = False print item if not_found_list: print 'Following items are not found in the input file, please make sure the file is in the right format' for item in not_found_list: print item flag = False return flag def fileslist(filesdir,myext,myexcludeinfile,myexcludeinroot,matchbase=''): """ given a directory to look into "filesdir", an extention to match it returns a list with """ verbose=0 if verbose: print 'filesdir',filesdir print 'myext',myext print 'myexcludeinfile',myexcludeinfile print 'myexcludeinroot',myexcludeinroot files1=[] #print os.path.join(filesdir,'*/*.bam') #for file in glob.glob('~/../*.bam'): #print file #time.sleep(1) for root,dir,files in os.walk(filesdir): if files: for file in files: #if os.path.splitext(file)[1]=='%s'%myext and '%s'%myexcludeinfile not in file and '%s'%myexcludeinroot not in root: if os.path.splitext(file)[1] == myext and myexcludeinroot not in root: exclude_flag =0 for x in myexcludeinfile: if x in file: exclude_flag=1 if not exclude_flag: if matchbase: if re.search(r'%s[0-9]*'%matchbase,os.path.splitext(file)[0]): #print '%s/%s'%(root,file) files1.append('%s/%s'%(root,file)) else: pass #print 'regular expression pattern not matched' #print 'root 2',root,'file 2',file #time.sleep(0.2) else: if verbose:print '*** root:',root,'*** file:',file print '%s/%s'%(root,file) files1.append('%s/%s'%(root,file)) #print 'root',root;print '\ndir',dir;print '\nfiles',files #time.sleep(1) #print 'from function fileslist files1 is ',files1, len(files1) #raise Exception return files1 def varmd2vcf(): """ creates a vcf file from the varsifter/varmd interval format """ format = check_varsifter_file_format() if format == True: hgref ={chr:{} for chr in chrs} myfasta = open(args.reference).readlines() counter=0 for line in myfasta[:10000]: counter+=1 if line.startswith('>'): basepos=0 chr = line.strip().replace('>','') #print chr else: hgref[chr][(basepos+1,basepos+50)] = line.strip() basepos+=50 #for k,v in hgref.iteritems(): # print k # print v if counter %100000 == 0: print counter print hgref['chr1'][(51,100)] def get_bases(key): left = int(key[1]) right= int(key[2]) cdleft = left / 50 cdright = right / 50 if cdleft == 0 and cdright == 0: return hgref[key[0]][(key[1],key[2])] elif cdleft == 0 and cdright > 0: first = hgref[key[0]][(key[1],50)] print 'first', first times = ((right - left ) /50 ) + 1 for time_ in xrange(times): print ((left/50)+1)*50 , (((left/50)+1)*50 )+50 first = hgref[key[0]][(key[1],50)] else: return False a = get_bases(['chr1', 1,50]) print a #for k,v in hgref.iteritems(): # for k1,v1 in v.iteritems(): # print v1[:300] # time.sleep(2) fw = open(args.output_file, 'w') fw.write('##fileformat=VCFv4.0\n') fw.write('##fileDate=%s\n'%time.ctime()) fw.write('##source=VAR-MD\n') fw.write('##reference=hg19-liftOver\n') fw.write('##INFO=\n') fw.write('\t'.join(['CHROM','POS','ID','REF','ALT', 'QUAL', 'FILTER', 'INFO'])+'\n') myfile =open(args.input_file) hline = myfile.readline() assert '\t' in hline,('The file is not tab delimited, please see varmd file specifications') htl = hline.strip().split('\t') counter=0 for line in myfile: counter+=1 if counter >100: break tl = line.strip().split('\t') CHR = tl[htl.index('Chr')].replace('Chr','') if chr == "M": chr == "MT" ID = '.' REF = tl[htl.index('ref_allele')] QUAL = "." FILTER = "." INFO = '.' if tl[htl.index('muttype')]=='SNP': POS = str(int(tl[htl.index('LeftFlank')])+1) ALT = tl[htl.index('var_allele')] elif tl[htl.index('muttype')]=='INDEL': pass raise Exception POS = str(int(tl[htl.index('LeftFlank')])+1) ALT = '' if "'" in tl[htl.index('ref_allele')]: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19LeftFlank')], '+%s:0/0'%tl[htl.index('var_allele')]] )+'\n' ) indelcounter+=1 flag=2 elif (tl[htl.index('var_allele')]!="''" and len(tl[htl.index('ref_allele')]) < len(tl[htl.index('var_allele')])): inserted=tl[htl.index('var_allele')].replace(tl[htl.index('ref_allele')],'',1) if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '+%s:0/0'%inserted) in indeldict: #print 'I found this indel in the dictioanry' pass else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], tl[htl.index('hg19LeftFlank')], '+%s:0/0'%inserted])+'\n') indelcounter+=1 flag=3 elif "'" in tl[htl.index('var_allele')]: if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '-%s:0/0'%tl[htl.index('ref_allele')]) in indeldict: #print 'I found this indel in the dictioanry' pass else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], str(int(tl[htl.index('hg19LeftFlank')]) + len(tl[htl.index('ref_allele')])), '-%s:0/0'%tl[htl.index('ref_allele')]] )+'\n' ) indelcounter+=1 flag=4 elif (tl[htl.index('var_allele')]!="''" and len(tl[htl.index('var_allele')]) < len(tl[htl.index('ref_allele')])): deleted=tl[htl.index('ref_allele')].replace(tl[htl.index('var_allele')],'',1) if (tl[htl.index('Chr')].replace('chr',''), tl[htl.index('hg19LeftFlank')], '-%s:0/0'%deleted) in indeldict: #print 'I found this indel in the dictioanry' inthemasterfilecounter+=1 else: outfile_indel.write('\t'.join( [tl[htl.index('Chr')], tl[htl.index('hg19LeftFlank')], str(int(tl[htl.index('hg19LeftFlank')])+len(tl[htl.index('var_allele')])), '-%s:0/0'%deleted])+'\n') indelcounter+=1 fw.write('\t'.join([CHR, POS, ID, REF, ALT, QUAL, FILTER, INFO])+'\n') else: raise Exception def split_by_chrs(): myfile = open(args.input_file) hline = myfile.readline() htl = hline.strip().split('\t') outfiles ={} for chr in chrs: outfiles[chr]= open('%s.%s'%(args.input_file, chr), 'w') outfiles[chr].write(hline) myfile = open(args.input_file) myfile.readline() for line in myfile: tl= line.strip().split('\t') outfiles[tl[htl.index('Chr')]].write(line) def combine_by_chrs(): myfile = open(args.output_file) outfiles ={} for chr in chrs: outfiles[chr]= open('%s.%s'%(args.input_file.replace('_dbsnp',''), chr)) hline = outfiles['chr1'].readline() myfile.write(hline) for line in myfile: tl= line.strip().split('\t') outfiles[tl[htl.index('Chr')]].write(line) if __name__=="__main__": #this token is to decide if total time it took for the program to run would be printed or not. arguments() mycfg = config(args.config_file) engine = create_engine(mycfg.dbconnect, pool_recycle=600) #pool_recycle values are in seconds metadata = MetaData(bind=engine) if args.verbose: print 'Function=',args.function_name print 'Arguments=',args.args if not os.path.exists(args.config_file): print """Configuration file was not found at %s Please make sure the configuration file is at the same directory as varpy , the path is provided with --config and that it contains correct data"""%args.config_file sys.exit() if args.downdb != '': downdb() if args.function_name == 'usage' and len(args.args) == 0: usage() endtimeprinttoken=0 elif args.function_name=='split_by_chrs': split_by_chrs() elif args.function_name=='prepare_liftover_input': prepare_liftover_input(args.input_file, args.output_file) elif args.function_name=='add_hg19': #add_hg19(args[0],args[1],args[2],args[3]) add_hg19(args.input_file, args.liftover, args.output_file) elif args.function_name == 'prep_ss': prep_seattle_seq_input(args.input_file, args.output_snv, args.output_indel, args.config_file) elif args.function_name =='annotate_ss_snv': annotate_seattle_seq_snv(args.input_file, args.output_file) elif args.function_name =='annotate_ss_indel': annotate_seattle_seq_indel(args.input_file, args.output_file) elif args.function_name=='scandirs_cdpred': scandirs_cdpred(args.input_file, args.output_file) elif args.function_name=='annotate_with_dbsnp': annotate_with_dbsnp(args.input_file, args.output_file) elif args.function_name=='annotate_with_ESP': annotate_with_ESP(args.input_file, args.output_file) elif args.function_name=='annotate_with_allfams': annotate_with_allfams(args.input_file,args.output_file) elif args.function_name=='simple_mendelian_filters': simple_mendelian_filters(args.input_file, args.args[0], include_bed_file = args.include_bed_file, linkage_bed_file = args.linkage_bed_file, cnv_bed_file = args.cnv_bed_file) elif args.function_name=='annotate_with_mendelian_filters': annotate_with_mendelian_filters(args.input_file, args.output_file, args.args[0], include_bed_file = args.include_bed_file, linkage_bed_file = args.linkage_bed_file, cnv_bed_file = args.cnv_bed_file) elif args.function_name=='mendelian_filter_com_het_res': mendelian_filter_com_het_res(args.input_file, args.output_file,args.args[0], include_bed_file = args.include_bed_file, linkage_bed_file = args.linkage_bed_file, cnv_bed_file = args.cnv_bed_file) #cProfile.run('split_allfams_byfamily()') elif args.function_name=='create_allsamples_db': create_allsamples_db() elif args.function_name=='create_allsamples_db_mysql': create_allsamples_db_mysql() elif args.function_name=='update_allfams': update_allfams(args[0]) elif args.function_name=='update_local_freq_file': update_local_freq_file(args[0]) elif args.function_name=='dbsnp_vcf2db': dbsnp_vcf2db() elif args.function_name=='ESP_vcf2db': ESP_vcf2db() elif args.function_name=='varmd2vcf': varmd2vcf() elif args.function_name=='annotate_with_interval_data': annotate_with_interval_data() elif args.function_name=='split_allfams_byfamily': split_allfams_byfamily() elif args.function_name=='split_allfams_bysample': split_allfams_bysample() elif args.function_name=='run_analysis': #args 0 path to file #prep_file(args[0],args[1],args[2]) run_analysis(args.config_file) elif args.function_name=='run_analysis_single_file': run_analysis_single_file(args.config_file) elif args.function_name=='fix_nextgene': if args: fix_nextgene(args[0]) else: print 'varmd.py --func fix_nextgene ' else: print 'The function names and arguments you provided were not recognised as legitimate function calls and parameters' usage() endall=time.time() if args.verbose: sys.stdout.write("\nIt took me %.2f minutes to complete the entire operation \n" %((endall-startall)/60))