import optparse
import pysam
import math

#indicates for each chrom the start and end of that chromsome vcfRecord in vcfRecods list
class Region:
	chrom="";
	startIndex=-1;
	endIndex=-1;
	
	def __init__(self, chrom, startIndex, endIndex) :
		self.chrom = chrom;
		self.startIndex = startIndex;
		self.endIndex   = endIndex;
	
	def __str__(self):
		return self.chrom + "\t" + str(self.startIndex) + "\t" + str(self.endIndex);

#Memory efficient version of VCF as we only need chrom, pos, ref, alt
class MyVCF:
	chrom="";
	pos = 0;
	ref = "Z";
	alt = "Z";

	def __init__(self, chrom, pos, ref, alt) :
                self.chrom = chrom;
                self.pos   = pos;
                self.ref   = ref;
		self.alt   = alt;

        def __str__(self):
                return self.chrom + "\t" + str(pos) + "\t" + ref + "\t" + alt ;

#########################################POSSESS VCF FILES
def readRecordVCF(fileName, vcfRecords, vcfChromIndex):
	index = 0;	
	prev_recod_chrom = "";
	vcf_reader = open(fileName, 'r');
	for line in vcf_reader:
		if(line[0] != '#'):
			if line.find('SNP')!=-1:
				data = line.split('\t'); 
				vcfRecords.append( MyVCF( data[0], int(data[1]), str(data[3]), str((data[4])[0])  ) );
				#print data[0], data[1], str(data[3]), str((data[4])[0]);
				if(prev_recod_chrom != data[0]):
					vcfChromIndex.append( Region( data[0], index, index) );
				else:
					vcfChromIndex[-1].endIndex = index;
				prev_recod_chrom = data[0];
				index += 1;
			if (index % 1000==0) :
				print "index=", index; 
	print "FINISH Reading VCF";
			
def findFirstCommonSNP(vcfRecords, vcfChromIndex, read, mateRead, chrom):
	start_search_index = -1;
	end_search_index   = -1;
	read_size = read.qlen;
	mate_end_pos = mateRead.pos + read_size;
	#print chrom;
	for i in range(len(vcfChromIndex)):
		if(vcfChromIndex[i].chrom == chrom):
			start_search_index = vcfChromIndex[i].startIndex;
			end_search_index   = vcfChromIndex[i].endIndex;
	#binary search to find the right range of values
	middle_search_index = int((start_search_index + end_search_index) / 2);
	while(middle_search_index != start_search_index and middle_search_index != end_search_index) :
		if( vcfRecords[middle_search_index].pos <  read.pos ) :
			start_search_index = middle_search_index;
		elif( (vcfRecords[middle_search_index].pos > mate_end_pos)) :
			end_search_index   = middle_search_index;
		elif( (vcfRecords[middle_search_index].pos >=  read.pos) and (vcfRecords[middle_search_index].pos <= mate_end_pos) ) :
			break;
		middle_search_index = int((start_search_index + end_search_index) / 2);
	#find the first position which the SNP is between the pairs of read
	if(middle_search_index != 0) :
		while ( (vcfRecords[middle_search_index].pos >  read.pos) and (vcfRecords[middle_search_index].pos < mate_end_pos)  ) :
			middle_search_index -= 1;
		middle_search_index += 1;

	return middle_search_index;		

def generateMAPFILE(outfileName, vcfRecords) :
	try:
		fileHandler = open(outfileName, "w");
	except:
		print "FAIL to open the outfile " + outfileName + " to generate the map files";
		return -1;		

	count = 0;
	for data in vcfRecords:
		fileHandler.write( data.chrom + "\tSNP" + str(count) + "\t" + 
				   str(data.pos) + "\t" + str(data.pos) +  "\t" + 
				   data.ref + "\t" + data.alt + "\n");
		count = count + 1;
	fileHandler.close();

######################################POSSESS BAM FILES
def strMatchMateVCF(vcfRecords, vcfChromIndex, read, mateRead, samFileHandler) :
	seq   = read.seq;
	pos   = read.pos+1;
	cigar = read.cigarstring;
	mpos  = read.mpos;
	chrom = samFileHandler.getrname(read.rnext)	
	size  = read.qlen;

	mate_seq   = mateRead.seq;
	mate_pos   = mateRead.pos+1;
	mate_cigar = mateRead.cigarstring; 

	heterozygous_snp_count = 0;	

	out_str = "";
	vcfSNPIndex = findFirstCommonSNP(vcfRecords, vcfChromIndex, read, mateRead, chrom);	
	index = vcfSNPIndex;
	if(vcfSNPIndex != -1) :
		#print seq, pos, cigar, chrom, mate_seq, mate_pos, vcfRecords[index].pos ;
		while( index < len(vcfRecords) and vcfRecords[index].pos < pos + size  ):
			#print pos, vcfRecords[index].pos, seq[vcfRecords[index].pos-pos], vcfRecords[index].ref, vcfRecords[index].alt;
			if(seq[vcfRecords[index].pos-pos] == vcfRecords[index].ref  ):
				out_str += vcfRecords[index].ref;
				heterozygous_snp_count += 1;
			elif(seq[vcfRecords[index].pos-pos] == vcfRecords[index].alt ):
				out_str += vcfRecords[index].alt;
				heterozygous_snp_count += 1;
			else:
				out_str += '-';
			index = index + 1;

		while( index < len(vcfRecords) and vcfRecords[index].pos >= pos + size and vcfRecords[index].pos < mate_pos ):
			out_str += "-";
			index = index + 1;
		#print index;
		while( index < len(vcfRecords) and vcfRecords[index].pos < mate_pos + size  ):
			#print mate_pos, vcfRecords[index].pos, mate_seq[vcfRecords[index].pos-mate_pos], vcfRecords[index].ref, vcfRecords[index].alt
			if(mate_seq[ vcfRecords[index].pos-mate_pos] == vcfRecords[index].ref ):
                                out_str += vcfRecords[index].ref;
				heterozygous_snp_count += 1;
                        elif(mate_seq[ vcfRecords[index].pos-mate_pos] == vcfRecords[index].alt ):
                                out_str += vcfRecords[index].alt;
				heterozygous_snp_count += 1;
                        else:
                                out_str += '-';
			index = index + 1;	
	
	if( heterozygous_snp_count >= 2 ) :
		out_str = str(vcfSNPIndex) + "\t" + out_str + "\n";
	else :
		out_str = "";
	return out_str;

def generateSEQFILE(bamFile, outfileName, vcfRecords, vcfChromIndex):
	try:
                file_handler = open(outfileName, "w");
        except:
                print "FAIL to open the outfile " + outfileName + " to generate the map files";
                return -1;
	
	sam_file = pysam.Samfile(bamFile, "rb" )	
	#print vcfRecords[vcfChromIndex[0].startIndex].pos, vcfRecords[vcfChromIndex[0].endIndex-1].pos;	
	#print "CHROM", vcfChromIndex[0].chrom;
	for alignedread in sam_file.fetch(vcfChromIndex[0].chrom, vcfRecords[vcfChromIndex[0].startIndex].pos, vcfRecords[vcfChromIndex[0].endIndex-1].pos) :
		read = alignedread;
		if(read.is_paired and read.is_proper_pair and  not(read.is_duplicate)) :
			mate_read = sam_file.mate(read);
			if(mate_read.pos > read.pos) :
				out_put = strMatchMateVCF(vcfRecords, vcfChromIndex, read, mate_read, sam_file);
				if(out_put == ""):
					continue;
				else:
					file_handler.write(out_put);
	sam_file.close();
	file_handler.close();


def main(parser):
	
	records = list();
	vcfChromIndex = list();
	
	(options, args) = parser.parse_args();
	vcfFileName = options.vcfFile;
	bamFileName = options.bamFile;
	outFileName = options.outFile;

	readRecordVCF(vcfFileName, records, vcfChromIndex);
	generateMAPFILE(outFileName+".map", records);
	for i in range(len(vcfChromIndex)):
		print vcfChromIndex[i];
	
	generateSEQFILE(bamFileName, outFileName+".seq", records, vcfChromIndex);


if __name__ == "__main__":
        parser = optparse.OptionParser("usage: %prog [options] ")
        parser.add_option("-o", "--out", dest="outFile",
                default="out", type="string",
                help="specify the output file")
        parser.add_option("-v", "--vcf", dest="vcfFile", default="",
                type="string", help="the name of the VCF file");
	parser.add_option("-b", "--bam", dest="bamFile", default="",
                type="string", help="the name of the BAM file");

	main(parser);