import optparse import pysam import math #indicates for each chrom the start and end of that chromsome vcfRecord in vcfRecods list class Region: chrom=""; startIndex=-1; endIndex=-1; def __init__(self, chrom, startIndex, endIndex) : self.chrom = chrom; self.startIndex = startIndex; self.endIndex = endIndex; def __str__(self): return self.chrom + "\t" + str(self.startIndex) + "\t" + str(self.endIndex); #Memory efficient version of VCF as we only need chrom, pos, ref, alt class MyVCF: chrom=""; pos = 0; ref = "Z"; alt = "Z"; def __init__(self, chrom, pos, ref, alt) : self.chrom = chrom; self.pos = pos; self.ref = ref; self.alt = alt; def __str__(self): return self.chrom + "\t" + str(pos) + "\t" + ref + "\t" + alt ; #########################################POSSESS VCF FILES def readRecordVCF(fileName, vcfRecords, vcfChromIndex): index = 0; prev_recod_chrom = ""; vcf_reader = open(fileName, 'r'); for line in vcf_reader: if(line[0] != '#'): if line.find('SNP')!=-1: data = line.split('\t'); vcfRecords.append( MyVCF( data[0], int(data[1]), str(data[3]), str((data[4])[0]) ) ); #print data[0], data[1], str(data[3]), str((data[4])[0]); if(prev_recod_chrom != data[0]): vcfChromIndex.append( Region( data[0], index, index) ); else: vcfChromIndex[-1].endIndex = index; prev_recod_chrom = data[0]; index += 1; if (index % 1000==0) : print "index=", index; print "FINISH Reading VCF"; def findFirstCommonSNP(vcfRecords, vcfChromIndex, read, mateRead, chrom): start_search_index = -1; end_search_index = -1; read_size = read.qlen; mate_end_pos = mateRead.pos + read_size; #print chrom; for i in range(len(vcfChromIndex)): if(vcfChromIndex[i].chrom == chrom): start_search_index = vcfChromIndex[i].startIndex; end_search_index = vcfChromIndex[i].endIndex; #binary search to find the right range of values middle_search_index = int((start_search_index + end_search_index) / 2); while(middle_search_index != start_search_index and middle_search_index != end_search_index) : if( vcfRecords[middle_search_index].pos < read.pos ) : start_search_index = middle_search_index; elif( (vcfRecords[middle_search_index].pos > mate_end_pos)) : end_search_index = middle_search_index; elif( (vcfRecords[middle_search_index].pos >= read.pos) and (vcfRecords[middle_search_index].pos <= mate_end_pos) ) : break; middle_search_index = int((start_search_index + end_search_index) / 2); #find the first position which the SNP is between the pairs of read if(middle_search_index != 0) : while ( (vcfRecords[middle_search_index].pos > read.pos) and (vcfRecords[middle_search_index].pos < mate_end_pos) ) : middle_search_index -= 1; middle_search_index += 1; return middle_search_index; def generateMAPFILE(outfileName, vcfRecords) : try: fileHandler = open(outfileName, "w"); except: print "FAIL to open the outfile " + outfileName + " to generate the map files"; return -1; count = 0; for data in vcfRecords: fileHandler.write( data.chrom + "\tSNP" + str(count) + "\t" + str(data.pos) + "\t" + str(data.pos) + "\t" + data.ref + "\t" + data.alt + "\n"); count = count + 1; fileHandler.close(); ######################################POSSESS BAM FILES def strMatchMateVCF(vcfRecords, vcfChromIndex, read, mateRead, samFileHandler) : seq = read.seq; pos = read.pos+1; cigar = read.cigarstring; mpos = read.mpos; chrom = samFileHandler.getrname(read.rnext) size = read.qlen; mate_seq = mateRead.seq; mate_pos = mateRead.pos+1; mate_cigar = mateRead.cigarstring; heterozygous_snp_count = 0; out_str = ""; vcfSNPIndex = findFirstCommonSNP(vcfRecords, vcfChromIndex, read, mateRead, chrom); index = vcfSNPIndex; if(vcfSNPIndex != -1) : #print seq, pos, cigar, chrom, mate_seq, mate_pos, vcfRecords[index].pos ; while( index < len(vcfRecords) and vcfRecords[index].pos < pos + size ): #print pos, vcfRecords[index].pos, seq[vcfRecords[index].pos-pos], vcfRecords[index].ref, vcfRecords[index].alt; if(seq[vcfRecords[index].pos-pos] == vcfRecords[index].ref ): out_str += vcfRecords[index].ref; heterozygous_snp_count += 1; elif(seq[vcfRecords[index].pos-pos] == vcfRecords[index].alt ): out_str += vcfRecords[index].alt; heterozygous_snp_count += 1; else: out_str += '-'; index = index + 1; while( index < len(vcfRecords) and vcfRecords[index].pos >= pos + size and vcfRecords[index].pos < mate_pos ): out_str += "-"; index = index + 1; #print index; while( index < len(vcfRecords) and vcfRecords[index].pos < mate_pos + size ): #print mate_pos, vcfRecords[index].pos, mate_seq[vcfRecords[index].pos-mate_pos], vcfRecords[index].ref, vcfRecords[index].alt if(mate_seq[ vcfRecords[index].pos-mate_pos] == vcfRecords[index].ref ): out_str += vcfRecords[index].ref; heterozygous_snp_count += 1; elif(mate_seq[ vcfRecords[index].pos-mate_pos] == vcfRecords[index].alt ): out_str += vcfRecords[index].alt; heterozygous_snp_count += 1; else: out_str += '-'; index = index + 1; if( heterozygous_snp_count >= 2 ) : out_str = str(vcfSNPIndex) + "\t" + out_str + "\n"; else : out_str = ""; return out_str; def generateSEQFILE(bamFile, outfileName, vcfRecords, vcfChromIndex): try: file_handler = open(outfileName, "w"); except: print "FAIL to open the outfile " + outfileName + " to generate the map files"; return -1; sam_file = pysam.Samfile(bamFile, "rb" ) #print vcfRecords[vcfChromIndex[0].startIndex].pos, vcfRecords[vcfChromIndex[0].endIndex-1].pos; #print "CHROM", vcfChromIndex[0].chrom; for alignedread in sam_file.fetch(vcfChromIndex[0].chrom, vcfRecords[vcfChromIndex[0].startIndex].pos, vcfRecords[vcfChromIndex[0].endIndex-1].pos) : read = alignedread; if(read.is_paired and read.is_proper_pair and not(read.is_duplicate)) : mate_read = sam_file.mate(read); if(mate_read.pos > read.pos) : out_put = strMatchMateVCF(vcfRecords, vcfChromIndex, read, mate_read, sam_file); if(out_put == ""): continue; else: file_handler.write(out_put); sam_file.close(); file_handler.close(); def main(parser): records = list(); vcfChromIndex = list(); (options, args) = parser.parse_args(); vcfFileName = options.vcfFile; bamFileName = options.bamFile; outFileName = options.outFile; readRecordVCF(vcfFileName, records, vcfChromIndex); generateMAPFILE(outFileName+".map", records); for i in range(len(vcfChromIndex)): print vcfChromIndex[i]; generateSEQFILE(bamFileName, outFileName+".seq", records, vcfChromIndex); if __name__ == "__main__": parser = optparse.OptionParser("usage: %prog [options] ") parser.add_option("-o", "--out", dest="outFile", default="out", type="string", help="specify the output file") parser.add_option("-v", "--vcf", dest="vcfFile", default="", type="string", help="the name of the VCF file"); parser.add_option("-b", "--bam", dest="bamFile", default="", type="string", help="the name of the BAM file"); main(parser);