Without any modifications, TargetP works well with small (by modern standards) queries of fewer than about 2,000 sequences at a time, but it becomes glitchy with larger queries, such as the 30k-100k genes that are typical of a plant transcriptome assembly.
To adapt TargetP for larger queries, I wrote a Python script called targetp_all.py that acts as a wrapper around TargetP. The script splits the input into smaller batches of sequences, runs TargetP on each batch, and combines the output.
The interface is the same as the original program's, with a few additional options. The output is simplified to tab-separated format.
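For example, a typical invocation on an assembly might look like this (the file names here are placeholders):

    python targetp_all.py -P -c --bin_size 1000 assembly_peptides.fasta > targetp_results.tsv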
It would also be nice to parallelize TargetP across multiple cores at once, but I haven't attempted this yet. I suspect there will be complications with conflicting temporary files, which may require careful modification of the original source code; a rough sketch of one possible approach follows.
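For anyone who wants to experiment, here is a minimal, untested sketch of what batch-level parallelism might look like, using the targetp() helper from the script below and a standard-library process pool. The run_batch() and targetp_parallel() names are mine, and the sketch sidesteps the real problem: it assumes each worker's TargetP invocation can be pointed at its own temporary directory (for example, a separate patched copy of the installation), which is exactly the part I haven't verified.

    import concurrent.futures

    from targetp_all import targetp  # the batch helper defined in the script below

    def run_batch(batch, options, path="targetp"):
        # each batch is independent, so it can be dispatched to a worker
        # process; 'path' would point at that worker's own targetp copy
        return targetp(batch, options, path=path)

    def targetp_parallel(batches, options, path="targetp", workers=4):
        with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as pool:
            futures = [pool.submit(run_batch, b, options, path) for b in batches]
            # collect in submission order so output order matches input order
            return [f.result() for f in futures]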
The full source of targetp_all.py follows. Biopython is a dependency.
#!/usr/bin/python
'''
Reads a fasta file of peptide sequences and runs targetp on all of them.
The interface is the same as the original program's, with a few additional
options. The output differs from normal targetp output: the header
information is reduced to one row, the dashed lines separating the sections
are removed, the footer is removed, and the sequence names are no longer
truncated. Sequences are run in small batches to avoid the errors associated
with running targetp on large files.
'''
import sys
import os
import subprocess
import argparse
from Bio import SeqIO


def targetp(input, options, path="targetp"):
    '''
    Calls targetp and returns a 2-tuple: at index 0 is a list of the fields
    from the output header, at index 1 is a list of lists of the entries from
    the data lines: ([output header fields], [[data entries], ...]).

    input: list of (name, sequence) 2-tuples. If predictions come back as
        all zeros, consider breaking the input down into smaller chunks.
    options: list of command line options for targetp, formatted with
        hyphens, like "-P", etc.
    path: full path of the targetp executable (can be omitted if targetp is
        in the system PATH).
    '''
    call_list = [path] + options
    (names, seqs) = zip(*input)
    # build a fasta-formatted string to feed to targetp on stdin
    input_fasta = ""
    for (i, name) in enumerate(names):
        input_fasta += ">" + name + "\n" + seqs[i] + "\n"
    proc_call = subprocess.Popen(call_list, stdin=subprocess.PIPE,
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    (stdout, stderr) = proc_call.communicate(input_fasta.encode("ascii"))
    stdout = stdout.decode("ascii")
    out_header = list()
    out_data = list()
    # targetp brackets its data section with two dashed divider lines;
    # walk the output with a small state machine: header -> data -> footer
    STATE = "header"
    div_str = "----------------------------------------------------------------------"
    for line in stdout.split("\n"):
        line = line.strip()
        if line == div_str:
            if STATE == "header":
                STATE = "data"
            elif STATE == "data":
                STATE = "footer"
        elif STATE == "header" and line.startswith("Name"):
            out_header = line.split()
        elif STATE == "data" and line:
            out_data.append(line.split())
    # targetp truncates sequence names; restore the originals
    for (i, name) in enumerate(names):
        out_data[i][0] = name
    return (out_header, out_data)


def main(outfile, infile, args):
    targetp_exec = "targetp"
    if args.targetp_path:
        if os.path.isfile(args.targetp_path):
            targetp_exec = args.targetp_path
        else:
            raise ValueError("Invalid path for targetp")
    call_list = list()
    if args.P:
        call_list.append("-P")
    elif args.N:
        call_list.append("-N")
    if args.c:
        call_list.append("-c")
    # pass each cutoff flag and its value as separate argv entries
    if args.p:
        call_list.extend(["-p", "%f" % args.p])
    if args.s:
        call_list.extend(["-s", "%f" % args.s])
    if args.t:
        call_list.extend(["-t", "%f" % args.t])
    if args.o:
        call_list.extend(["-o", "%f" % args.o])

    def run_cache(cache):
        (out_header, out_data) = targetp(cache, call_list, path=targetp_exec)
        if not run_cache.header_written:
            outfile.write("\t".join(out_header) + "\n")
            run_cache.header_written = True
        for data_line in out_data:
            outfile.write("\t".join(data_line) + "\n")
    run_cache.header_written = False

    # cache is a list of 2-tuples: (sequence name, amino acid sequence)
    cache = list()
    num_seqs = 0
    for seq in SeqIO.parse(infile, 'fasta'):
        cache.append((str(seq.name), str(seq.seq)))
        num_seqs += 1
        if num_seqs == args.bin_size:
            run_cache(cache)
            cache = list()
            num_seqs = 0
    if cache:  # run whatever is left over after the last full batch
        run_cache(cache)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="A wrapper for targetp for "
                                     "processing large fasta files. Outputs "
                                     "to stdout.")
    parser.add_argument('input', nargs='?', default=None,
                        help="input fasta file; if none given, read from stdin")
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument("-P", action='store_true', default=False,
                       help="use plant networks")
    group.add_argument("-N", action='store_true', default=False,
                       help="use non-plant networks")
    parser.add_argument("-c", action='store_true', default=False,
                        help="include cleavage site predictions")
    parser.add_argument('-p', type=float, default=0.00,
                        help="chloroplast prediction cutoff, default 0.00")
    parser.add_argument('-s', type=float, default=0.00,
                        help="secretory pathway prediction cutoff, default 0.00")
    parser.add_argument('-t', type=float, default=0.00,
                        help="mitochondrial prediction cutoff, default 0.00")
    parser.add_argument('-o', type=float, default=0.00,
                        help="'other' prediction cutoff, default 0.00")
    parser.add_argument('--targetp_path', default=None,
                        help="path to targetp executable, if not in system PATH")
    parser.add_argument('--bin_size', type=int, default=1000,
                        help="number of sequences to submit to targetp at one "
                             "time. If this number is too large, the program "
                             "will fail; if it is too small, the program will "
                             "run slowly. I find that numbers between 500 and "
                             "1500 usually work well, but it may depend on the "
                             "sizes of the proteins you have.")
    args = parser.parse_args()
    if args.bin_size < 1:
        parser.error("bin_size must be an integer greater than or equal to 1")
    # read from stdin if no input file is given; output always goes to stdout
    outfile = sys.stdout
    infile = sys.stdin
    if args.input is not None:
        infile = open(args.input, 'r')
    main(outfile, infile, args)
    outfile.close()