usage:
tstv <file1.fa> <file2.fa>counts the numbers of transitions (Ts) and transversions (Tv) between two sequences in FASTA format
tstv.cpp
#include <iostream> #include <fstream> #include <string> #include <cstdlib> inline bool is_purine(char c) { return (c=='A') || (c=='G'); } inline bool is_pyrimidine(char c) { return (c=='T') || (c=='C'); } int main(int argc, char *argv[]) { if(argc !=3) { std::cerr << "tstv" << std::endl; return 1; } std::ifstream ifs1(argv[1]); if(!ifs1.is_open()) { std::cerr << "could not open input file " << argv[1] << std::endl; return 1; } std::ifstream ifs2(argv[2]); if(!ifs2.is_open()) { std::cerr << "could not open input file " << argv[2] << std::endl; return 1; } std::string h1, h2; std::getline(ifs1,h1); if(ifs1.fail() || (h1[0] != '>')) { std::cerr << "input file is not a FASTA file: " << argv[1] << std::endl; return 1; } std::getline(ifs2,h2); if(ifs2.fail() || (h2[0] != '>')) { std::cerr << "input file is not a FATA file: " << argv[2] << std::endl; return 1; } std::size_t n_total(0); std::size_t nd(0); std::size_t ns(0); while(true) { char c1(0), c2(0); while(true){ ifs1.get(c1); if(ifs1.eof() || (c1!='\n'))break; } while(true){ ifs2.get(c2); if(ifs2.eof() || (c2!='\n'))break; } if(ifs1.eof()) { if(ifs2.eof()) { break; } else { std::cerr << "premature end of input file 1" << std::endl; return 1; } } else if(ifs2.eof()) { std::cerr << "premature end of input file 2" << std::endl; return 1; } n_total++; c1 = std::toupper(c1); c2 = std::toupper(c2); if((c1 == 'N') || (c2 == 'N'))continue; switch(c1) { case 'A': case 'G': case 'T': case 'C': break; default: std::cerr << "unknown character " << c1 << " found in input file 1" << std::endl; return 1; } switch(c2) { case 'A': case 'G': case 'T': case 'C': break; default: std::cerr << "unknown character " << c2 << " found in input file 2" << std::endl; return 1; } if(c1 == c2)continue; nd++; if(is_purine(c1) && is_purine(c2)) { ns++; continue; } if(is_pyrimidine(c1) && is_pyrimidine(c2))ns++; } std::cout << n_total << '\t' << ns << '\t' << nd-ns << std::endl; return 0; }
usage:
Usage: python extract-on-chr-pos.py [options] <database file> <target file/-> options -v invert selections -c output comment lines (starting with #)extracts lines from 2nd file based on 1st file. It can be used for VCF files, the outputs of
snpexp
, and so on.
#ectract lines based on the 1st (chromosome) and 2nd (position) columns #Assumptions: files have at least 1st column (string) and 2nd column (integer) except those lines start with '#' character import sys, getopt def usage(): print >> sys.stderr, "Usage: python extract-on-chr-pos.py [options]" print >> sys.stderr, "\toptions\n\t-v\tinvert selections\n\t-c\toutput comment lines (starting with #)" def main(): col_chr = 0 col_pos = 1 is_invert_mode = False is_output_comments = False try: opts, args = getopt.getopt(sys.argv[1:],"vc") except getopt.GetoptError as err: print str(err) return 2 if len(args) != 2: usage() return 1 for o,a in opts: if o=="-v": is_invert_mode = True elif o=="-c": is_output_comments = True else: assert False, "undefined option" if args[1] == "-": target = sys.stdin else: target = open(args[1],"r") # construct database chrom_dic = dict({}) with open(args[0],"r") as fin: for line in fin: li = line.rstrip() if len(li) > 0 and li[0]=='#': continue ls = li.split('\t') chrom = ls[col_chr] if not chrom_dic.has_key(chrom): chrom_dic[chrom] = set({}) pos = int(ls[col_pos]) chrom_dic[chrom].add(pos) with target as fin: for line in fin: li = line.rstrip() if len(li) > 0 and li[0]=='#': if is_output_comments: print li continue ls = li.split("\t") chrom = ls[col_chr] pos = int(ls[col_pos]) if not chrom_dic.has_key(chrom): if is_invert_mode: print li continue else: if not pos in chrom_dic[chrom]: if is_invert_mode: print li continue elif not is_invert_mode: print li return 0 if __name__ == '__main__': main()