Small utilities created by Expo70 for NGS analysis

tstv

usage:

tstv <file1.fa> <file2.fa>

counts the numbers of transitions (Ts) and transversions (Tv) between two sequences in FASTA format

tstv.cpp

#include <iostream>
#include <fstream>
#include <string>
#include <cstdlib>

inline bool is_purine(char c)     { return (c=='A') || (c=='G'); }
inline bool is_pyrimidine(char c) { return (c=='T') || (c=='C'); } 

int main(int argc, char *argv[])
{
	if(argc !=3)
	{
		std::cerr << "tstv  " << std::endl;
		return 1;
	}
	
	std::ifstream ifs1(argv[1]);
	if(!ifs1.is_open())
	{
		std::cerr << "could not open input file " << argv[1] << std::endl;
		return 1;
	}
	std::ifstream ifs2(argv[2]);
	if(!ifs2.is_open())
	{
		std::cerr << "could not open input file " << argv[2] << std::endl;
		return 1;
	}
	
	std::string h1, h2;
	
	std::getline(ifs1,h1);
	if(ifs1.fail() || (h1[0] != '>'))
	{
		std::cerr << "input file is not a FASTA file: " << argv[1] << std::endl;
		return 1;
	}
	std::getline(ifs2,h2);
	if(ifs2.fail() || (h2[0] != '>'))
	{
		std::cerr << "input file is not a FATA file: " << argv[2] << std::endl;
		return 1;
	}
	
	std::size_t n_total(0);
	std::size_t nd(0);
	std::size_t ns(0);
	
	while(true)
	{
		char c1(0), c2(0);
		while(true){ ifs1.get(c1); if(ifs1.eof() || (c1!='\n'))break; }
		while(true){ ifs2.get(c2); if(ifs2.eof() || (c2!='\n'))break; }
		
		if(ifs1.eof())
		{
			if(ifs2.eof())
			{
				break;
			}
			else
			{
				std::cerr << "premature end of input file 1" << std::endl;
				return 1;
			}
		}
		else if(ifs2.eof())
		{
			std::cerr << "premature end of input file 2" << std::endl;
			return 1;
		}
		
		n_total++;
		c1 = std::toupper(c1);
		c2 = std::toupper(c2);
		if((c1 == 'N') || (c2 == 'N'))continue;
		 
		switch(c1)
		{
		case 'A':
		case 'G':
		case 'T':
		case 'C':
			break;
		default:
			std::cerr << "unknown character " << c1 << " found in input file 1" << std::endl;
			return 1;
		}
		switch(c2)
		{
		case 'A':
		case 'G':
		case 'T':
		case 'C':
			break;
		default:
			std::cerr << "unknown character " << c2 << " found in input file 2" << std::endl;
			return 1;
		}
		if(c1 == c2)continue;
		nd++;
		if(is_purine(c1) && is_purine(c2))
		{
			ns++;
			continue;
		}
		if(is_pyrimidine(c1) && is_pyrimidine(c2))ns++;
	}
	
	std::cout << n_total << '\t' << ns << '\t' << nd-ns << std::endl;
	return 0;
}

extract-on-chr-pos.py

usage:

Usage: python extract-on-chr-pos.py [options] <database file> <target file/->
	options
	-v	invert selections
	-c	output comment lines (starting with #)

extracts lines from 2nd file based on 1st file. It can be used for VCF files, the outputs of snpexp, and so on.

#ectract lines based on the 1st (chromosome) and 2nd (position) columns
#Assumptions: files have at least 1st column (string) and 2nd column (integer) except those lines start with '#' character
import sys, getopt

def usage():
	print >> sys.stderr, "Usage: python extract-on-chr-pos.py [options]  "
	print >> sys.stderr, "\toptions\n\t-v\tinvert selections\n\t-c\toutput comment lines (starting with #)"

def main():
	col_chr = 0
	col_pos = 1
	is_invert_mode = False
	is_output_comments = False

	try:
		opts, args = getopt.getopt(sys.argv[1:],"vc")
	except getopt.GetoptError as err:
		print str(err)
		return 2

	if len(args) != 2:
		usage()
		return 1

	for o,a in opts:
		if o=="-v":
			is_invert_mode = True
		elif o=="-c":
			is_output_comments = True
		else:
			assert False, "undefined option"
	
	if args[1] == "-":
		target = sys.stdin
	else:
		target = open(args[1],"r")
	
	# construct database
	chrom_dic = dict({})
	with open(args[0],"r") as fin:
		for line in fin:
			li = line.rstrip()
			if len(li) > 0 and li[0]=='#':
				continue
			ls = li.split('\t')
			chrom = ls[col_chr]
			if not chrom_dic.has_key(chrom):
				chrom_dic[chrom] = set({})
			pos = int(ls[col_pos])
			chrom_dic[chrom].add(pos)
	
	with target as fin:
		for line in fin:
			li = line.rstrip()
			if len(li) > 0 and li[0]=='#':
				if is_output_comments:
					print li
				continue
			ls = li.split("\t")
			chrom = ls[col_chr]
			pos = int(ls[col_pos])
			if not chrom_dic.has_key(chrom):
				if is_invert_mode:
					print li
				continue
			else:
				if not pos in chrom_dic[chrom]:
					if is_invert_mode:
						print li
					continue
				elif not is_invert_mode:
					print li
	return 0						

if __name__ == '__main__':
	main()