#!/usr/bin/perl # # Script to convert raw trace files into dbEST submission # files, using phred and cross_match #See trace2dbest documentation for more details #last update 18/11/04 Alasdair Anthony #Author - Alasdair Anthony, University of Edinburgh #this verison - 2.1.1 #**edited by AA 19/8 to include option of alternative ecoli.seq file #**edited by AA 18/11 to fix -1 in substr line ~976 # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # use warnings; use Term::ANSIColor; use File::Path; #for rmtree use File::Copy; use Cwd; #see sub storependingEST use Term::ReadLine; #makes user input a bit more friendly use strict; my $ESTfileflag=0; my $Pubfileflag=0; my $Libfileflag=0; my $Contfileflag = 0; my $dirflag=0; my $psendmail =1; my $pblastall =1; my $pblastcl3=1; #allows bits of trace2dbest to be missed out if certain prog not in PATH my $no_vect =1; #flag to indicate if user can see their vector in vector.seq my $trace_dir_flag =1; #flag to allow returns to start of trace directory section my $blast = 0; my $blast_save =0; my $flag =1; #general use flag, generally used for while loops enclosing print questions my @dirlist=("fasta","phd_dir","qual","scf","subfiles","fastafiles", "blast_reports", "partigene", "process", "process/traceinfo"); my @progs = ("phred", "cross_match", "blastcl3", "blastall", "sendmail"); my $basedir = "/usr/software/trace2dbest/trace2dbest"; my $Lib_file = "$basedir/Libfile.db"; my $Pub_file = "$basedir/Pubfile.db"; my $Cont_file = "$basedir/Contfile.db"; my $EST_file = "$basedir/ESTfile.db"; #declare @ my (@stats, @ESTfile_data, @Libfile_data, @Pubfile_data, @Contfile_data, @blastdb); #declare $ my ($less_flag, $dir, $file_type, $type_id, $input, $seq_primer, $pcrf_primer, $pcrr_primer, $p_end, $comment, $public_date, $adapter, $vector_file, $scheme, $tracedir, $library, $high_qual_cutoff, $vminmatch, $vminscore,$eminmatch, $eminscore, $poly_num, $sl_flag, $sl_seq, $blastdata, $blast_database, $ecoli_file,$ecoli_cross, $cont_name, $lib_name, $pub_cit, $lib_descr, $final_file_num, $dbESTsubmission, $species, $save_location, $no_lib, $no_cont, $no_pub, $chim_flag, $chim_flag_e, $bits_cutoff, $read_gnu); #declare % for(my $i=0;$i<6;$i++) { #set stats to 0 $stats[$i]=0; } if (-f "/usr/bin/less"){ $less_flag = 1; } else { $less_flag = 0; } #this is the only way i could figure of getting the user's home directory my $currdir = getcwd(); chdir; #changes to the users home directory my $homedir = getcwd(); chdir $currdir; #change back to the directory we started in #SECTION 1 - prep #Create ReadLine object and set flag. Clear up directoires left after previous usage - - check that phred/cross_match/sendmail/blastcl3/blastall are in users path my $term = new Term::ReadLine 'sample'; $term ->ornaments(0); #stops prompt getting underlined my $attribs = $term->Attribs; $attribs->{completion_entry_function} = $attribs->{filename_completion_function}; if ($term ->ReadLine() =~ /Gnu$/) { #returns the actual package that executes the commands - we need gnu $read_gnu = 1; } else { print "The readline package Term::ReadLine:Gnu was not found.\nInstalling this package will make interaction with trace2dbest more friendly.\nContinuing...\n"; $read_gnu = 0; } foreach $dir (@dirlist) { if(stat("$dir")) { $dirflag=1; last; } } if($dirflag==1) { print colored ("\n#### WARNING ! ####","green bold"); print "\nSome directories/files from previous runs exist. These\n"; print "directories must be removed before trace2dbest can continue.\n"; print "Remove now [y/n]? : "; &query_for_exit(); foreach $dir (@dirlist) { if(stat("$dir")) { print "Remove directory \'$dir\' and its contents ? [y/n] : "; &query_for_exit(); &rmtree($dir); } } } if (-e "logfile") { print "Removing old logfile\n"; unlink "logfile"; } if (-e "phredlogfile") { print "Removing old phred logfile\n"; unlink "phredlogfile"; } if (-e "cmlogfile") { print "Removing old cross_match logfile\n"; unlink "cmlogfile"; } my $log = "logfile"; open(LOG, ">$log"); #check that all the req/optional progs are in the user's path my @PATH=split(":","$ENV{'PATH'}"); #Get users Path foreach my $prog (@progs) { my $found =0; foreach my $path (@PATH) { if (-x "$path/$prog") { $found= 1; last; } } unless ($found) { if ($prog =~ /(?:phred|cross_match)/) { #?: means that the string isn't stored in $1. print "ERROR: $prog was not found in your path. This program is required by trace2dbest.\ntrace2dbest will now exit\n"; print LOG "$prog not in user's PATH\n"; exit(); } else { print "WARNING: $prog was not found in your path. trace2dbest will run without this\nprogram, but you will not be able to use all the functions.\n"; print LOG "$prog not in user's PATH\n"; if ($prog =~ /sendmail/) { #allows auto mail to be missed because sendmail not in path $psendmail=0; } if ($prog =~ /blastall/) { $pblastall=0; } if ($prog =~ /blastcl3/) { $pblastcl3 =0; } } print "Hit return to continue: "; $input =<>; } } #Now check that phred environment variable is set my @env_keys = keys %ENV; my $phred_env_flag =0; foreach my $key (@env_keys) { if ($key eq "PHRED_PARAMETER_FILE") { my $y = $ENV{$key}; unless (-r $y) { #file is readable print "ERROR: The PHRED_PARAMETER_FILE environment variable is set to $y\n"; print "but this file is not readable!\n"; print "This variable must be set properly, please see the phred install instructions\n"; print "trace2dbest will now exit\n"; print LOG "PHRED_PARAMETER_FILE is set to $y but this file is not readable\n"; exit(); } $phred_env_flag = 1; #set flag to say phred param file found ok } } unless ($phred_env_flag) { print "ERROR: The PHRED_PARAMETER_FILE envrionment variable is not set!\n"; print "This variable must be set properly, please see the phred install instructions\n"; print "trace2dbest will now exit\n"; print LOG "PHRED_PARAMETER_FILE is not set.\n"; exit(); } #SECTION 2 - get user input #Present title page. Ask for info to complete Lib, Cont and Pub files - then EST file can be made (using some info from the other files) &print_title(); print colored("\nSection 1 - Lib, Cont, Pub and EST file information\n", "bold"); my $tryagain = 0; #flag so that if users need to 'try again' to get the file then they don't get the header info which they've seen already while($Libfileflag==0) { #get Library file info $file_type = "Library"; $type_id = "TYPE: Lib"; my $answer=&file_choices_page($file_type, $Lib_file, $tryagain); $tryagain =1; if($answer=~ /h/i) { &print_help(); next; } #use less to show trace2dbest.doc if($answer=~ /q/i) { print "Ok, trace2dbest exiting...\nBye \n"; exit(); } #exit program if($answer == 1) { @Libfile_data= &dummyLib_file_data(); $no_lib = 1;} #no Lib file needed so make dummy file with lib name for library field of est file. set flag so we know what to add to sub file elsif($answer == 2) { @Libfile_data=&newLib_file_data(); } #create new data for Lib file else { @Libfile_data=&get_file_data($answer, $Lib_file, $type_id); } #record from Libfile.db to be used if (@Libfile_data) { $Libfileflag =1; foreach my $line (@Libfile_data) { #extract various bit of info for other bits of the process if ($line =~ /^NAME:\s(.*)/) { $lib_name = $1; } if ($line =~ /ORGANISM:\s(.*)/) { $species = $1; } } } } $tryagain = 0; while($Contfileflag==0) { #get Contact file info $file_type = "Contact"; $type_id = "TYPE: Cont"; my $answer=&file_choices_page($file_type, $Cont_file, $tryagain); $tryagain =1; if($answer=~ /h/i) { &print_help(); next; } #use less to show trace2dbest.doc if($answer=~ /q/i) { print "Ok, trace2dbest exiting...\nBye \n"; exit(); } #exit program if($answer == 1) { @Contfile_data= &dummyCont_file_data(); $no_cont = 1;} #no Cont file needed so make dummy file with name for contact field of est file. elsif($answer == 2) { @Contfile_data=&newCont_file_data(); } #create new data for Cont file else { @Contfile_data=&get_file_data($answer, $Cont_file, $type_id); } #record from Contfile.db to be used if (@Contfile_data) { $Contfileflag =1; foreach my $line (@Contfile_data) { if ($line =~ /^NAME:\s(.*)/) { $cont_name = $1; } } } } $tryagain = 0; while($Pubfileflag==0) { #get Publication file info $file_type = "Publication"; $type_id = "TYPE: Pub"; my $answer=&file_choices_page($file_type, $Pub_file, $tryagain); $tryagain =1; if($answer=~ /h/i) { &print_help(); next; } #use less to show trace2dbest.doc if($answer=~ /q/i) { print "Ok, trace2dbest exiting...\nBye \n"; exit(); } #exit program if($answer == 1) { @Pubfile_data= &dummyPub_file_data(); $no_pub = 1;} #if pub file is not to be created. make dummy file with just the title to put in EST file elsif($answer == 2) { @Pubfile_data=&newPub_file_data(); } #create new data for Pub file else { #record from Pubfile.db to be used @Pubfile_data=&get_file_data($answer, $Pub_file, $type_id); #get pub_cit for est file. needs to be done separately from cases where pub file was made if (@Pubfile_data) { #...but we only want to extract this info if a pub file was selected #$Pubfileflag =1; my $flag =0; foreach my $line (@Pubfile_data) { if ($line =~ /AUTHORS:/ || $line =~ /\|\|/) { #need || in case dummpy pub file being used $flag = 0; #the multi lined title has come to an end so stop reading lines to pub-cit } if ($flag == 1) { $pub_cit = $line; chomp $pub_cit; } if ($line =~ /^TITLE:/ ) { $flag = 1; } } } } if (@Pubfile_data) { $Pubfileflag =1; } } chomp $pub_cit; #use species name from lib file to make directory structure do now in case it fails $species =~ s/\s/_/g; #swap spaces for underscores if (-w "/home/db/est_solutions") { unless (-w "/home/db/est_solutions/$species") { mkdir "/home/db/est_solutions/$species", 0777 or die "Can't make directory /home/db/est_solutions/$species\n $!\n"; } unless (-w "/home/db/est_solutions/$species/trace2dbest") { mkdir "/home/db/est_solutions/$species/trace2dbest", 0777 or die "Can't make directory /home/db/est_solutions/$species/trace2dbest\n$!\n"; } $save_location = 1; #so we know where to save the files at the end } else { #set up directory structure in user's home directory unless (-w "$homedir/est_solutions") { mkdir "$homedir/est_solutions", 0755 or die "Can't make directory : $!\n"; } unless (-w "$homedir/est_solutions/$species") { mkdir "$homedir/est_solutions/$species" , 0755 or die "Can't make directory : $!\n"; } unless (-w "$homedir/est_solutions/$species/trace2dbest") { mkdir "$homedir/est_solutions/$species/trace2dbest" , 0755 or die "Can't make directory : $!\n"; } $save_location = 0; } my @EST_file = &EST_file_data(); #print "The EST file looks like this:\n@EST_file\n"; #set variables for final EST submission files foreach my $field (@EST_file) { if ($field =~ /SEQ_PRIMER:\s(.*)/ ) { $seq_primer = $1; #primer name extracted later } elsif ($field =~ /PCR_F:\s(.*)/ ) { $pcrf_primer = $1; }elsif ($field =~ /PCR_B:\s(.*)/ ) { $pcrr_primer = $1; }elsif ($field =~ /P_END:\s(.*)/ ) { $p_end = $1; }elsif ($field =~ /PUBLIC:\s(.*)/ ) { $public_date = $1; }elsif ($field =~ /COMMENT:\s(.*)/ ) { $comment = $1; } } #Section 2 - Processing print colored("\nSection 2 - trace2dbest processing information\n", "bold"); print "\nThe following information is required to allow trace2dbest to process your\ntraces efficiently.\n"; print colored ("\nAdapter","bold"); print "\n"; #on separate line so next question doesn't get bolded $flag =1; while ($flag) { #print "If you would like trace2dbest to trim off an adapter sequence (and everything\nupstream of it), please enter the adapter sequence here or hit\nreturn to continue: "; $adapter = &get_input("If you would like trace2dbest to trim off an adapter sequence (and everything\nupstream of it), please enter the adapter sequence here or hit\nreturn to continue: "); chomp $adapter; #this check has been hashed out so that a regex can be used in the adapter sequence #if ($adapter =~ /[^ACGT]/i) { # print "Please enter a valid DNA sequence.\n"; #} else { $flag = 0; #} } print colored ("\nVector file\n","bold"); while ($no_vect) { print "The default location for the vector.seq file is\n $basedir/vector.seq\n"; $input = &get_input("Hit return to use this file or enter a different path here: "); if ($input) { $vector_file = $input; } else { $vector_file = "$basedir/vector.seq"; } $flag =1; while ($flag) { if (-s $vector_file) { #file exists and has non-zero size $flag =0; } else { print "ERROR: the vector file you have selected does not appear to exist or is empty.\n"; #print "Please enter a different path for the vector.seq file (include 'vector.seq' at the end): "; $vector_file = &get_input("Please enter a different path for the vector.seq file (include 'vector.seq' at the end): "); chomp $vector_file; } } $no_vect=0; #assume vector is in file (but see below) #print "Would you like to see a list of all the vector sequences in\n $vector_file? (y/n): "; $input= &get_input("Would you like to see a list of all the vector sequences in\n $vector_file? (y/n): "); chomp $input; if ($input =~ /y/i) { $no_vect = &Get_vectors($vector_file); #will return 1 if vector not in file } } #extra bit to check that ecoli.seq is in place print colored ("\nE.coli sequence information","bold"); print "\n"; #so that next bit doesn't get bolded #print "Do you want to screen for E.coli sequence in your ESTs? (y/n): "; $input = &get_input("Do you want to screen for E.coli sequence in your ESTs? (y/n): "); chomp $input; if ($input =~ /y/i) { print "Ok, checking for ecoli.seq file..."; if (-s "$basedir/ecoli.seq") { #file exists with non-zero size #$ecoli_file = "$basedir/ecoli.seq"; print "ecoli.seq file found at $basedir/ecoli.seq.\n"; my $input = &get_input ("Use this file (y/n)?: "); if ($input && $input =~ /^y$/i) { #input can be y or nothing $ecoli_file = "$basedir/ecoli.seq"; } else { $ecoli_file = &get_input ("Enter alternative file here:\n"); if (-s $ecoli_file) { print "Ok, using $ecoli_file\n"; } else { print "ERROR! $ecoli_file is empty!\ntrace2dbest will now exit\n"; exit(); } } } else { print "Warning: The file ecoli.seq was not found in its usual location.\n"; my $flag =1; while ($flag) { #print "Please enter the full path to this file (e.g. /usr/software/trace2dbest/)\nor enter 'q' to quit: "; my $input = &get_input("Please enter the full path to this file (e.g. /usr/software/trace2dbest/)\nor enter 'q' to quit: "); chomp $input; $ecoli_file = "$input/ecoli.seq"; if ($input =~ /^q{1}$/i) { print "Ok, trace2dbest will now exit...\n"; exit(); } elsif (-r $ecoli_file) { print "Ok, file found at $ecoli_file\n"; $flag =0; } else { print "$ecoli_file does not\nexist or could not be read, try again.\n"; } } } $ecoli_cross =1; } else { print "Ok, trace2dbest will NOT screen for E.coli sequence.\n"; $ecoli_cross = 0; } print colored ("\nTrace file naming scheme\n","bold"); print "In order for your traces to be processed, the file names must follow one of\nthese schemes:\n"; print "\n1 - NERC Environmental Genomics scheme\n2 - STRESSGENES scheme\n"; #print "\nPlease enter the appropriate number: "; $input = &get_input("\nPlease enter the appropriate number: "); chomp $input; $flag =1; while ($flag) { if ($input !~ /^(1|2)$/) { #print "Please enter 1 or 2: \n"; $input = &get_input("Please enter 1 or 2: \n"); chomp $input; } else { $flag =0; } } $scheme = $input; print colored ("\n\nTrace file directory","bold"); print "\n"; #needed to stop the following question coming up in bold while ($trace_dir_flag) { #print "Please enter the full path of the directory containing the trace files\nto be processed\n"; $input = &get_input("Please enter the full path of the directory containing the trace files\nto be processed\n"); #$input =<>; chomp $input; while ($input =~ /\./) { #check for . usage (invalid) print "The current directory is not a valid option. See trace2dbest documentation\n"; print "for more details.\n"; #print "Please enter a different directory or q to quit trace2dbest.\n"; $input = &get_input("Please enter a different directory or q to quit trace2dbest.\n"); chomp $input; } while (!-R $input) { #print "ERROR: $input is not readable. Please re-enter the trace file\ndirectory path or hit 'q' to quit\n"; $input = &get_input("ERROR: $input is not readable. Please re-enter the trace file\ndirectory path or hit 'q' to quit\n"); chomp $input; if ($input =~ /^q{1}$/i) { print "Ok, trace2dbest will now exit.\n"; exit(); } while ($input =~ /\./) { #check for . usage (invalid) print "The current directory is not a valid option. See trace2dbest documentation\n"; print "for more details.\n"; #print "Please enter a different directory or q to quit trace2dbest.\n"; $input = &get_input("Please enter a different directory or q to quit trace2dbest.\n"); chomp $input; } } $tracedir = $input; #get total files in dir and total with library identifer opendir(DIR, "$tracedir") or die "$tracedir could not be opened!\n"; my @file_num = grep !/^\.\.?\z/ , readdir DIR; #don't get . and .. my $file_nums=@file_num; my @trace_num; #declare trace_num close DIR; #don't know why DIR has to be closed and then re=opened opendir(DIR, "$tracedir")or die "$tracedir could not be opened!\n"; if ($scheme == 1) { #EG format @trace_num = grep { m/^[A-Za-z]{2}_\w{2,5}_\d{2}[A-Za-z]\d{2}$/ } readdir DIR; } elsif ($scheme ==2) { #SG format @trace_num = grep { m/^[A-Z][a-z][A-Z]{2}[0-9]{2}[a-z][0-9]{2}[a-z][0-9]{2}[a-z][0-9]_[A-Z][a-z]{2}[A-Z][a-z]$/ } readdir DIR; } else { die "Naming scheme outside allowable range!\n"; } my $trace_nums=@trace_num; if ($trace_nums ==0) { #special case for when NO trace files are found #print "\nNo files with the specified naming scheme were found in $tracedir.\nHit return to re-enter the trace file directory details, or q to quit\ntrace2dbest (so you can edit your trace directory): "; $input = &get_input("\nNo files with the specified naming scheme were found in $tracedir.\nHit return to re-enter the trace file directory details, or q to quit\ntrace2dbest (so you can edit your trace directory): "); chomp $input; if ($input =~ /^q{1}$/i) { print "Ok, trace2dbest will now exit...\n"; exit(); } else { next; } } my @nolib; if ($file_nums > $trace_nums) { print "\nWarning: There are $file_nums files in $tracedir, however, only $trace_nums files match\nthe specified naming scheme.\n"; print "The following files in do not match the specified naming scheme: \n\n"; #for the grep operator (cf glob) we can use a regex to match the specified naming scheme. if ($scheme == 1) { #EGformat @nolib = grep { !m/^([A-Za-z]{2}_\w{2,5}_\d{2}[A-Za-z]\d{2})$/} @file_num; #find files that don't match (!m) EG format regex (could have used two foreach loops) } elsif ($scheme ==2) { #SG format @nolib = grep { !m/^([A-Z][a-z][A-Z]{2}[0-9]{2}[a-z][0-9]{2}[a-z][0-9]{2}[a-z][0-9]_[A-Z][a-z]{2}[A-Z][a-z])$/} @file_num; } else { die "Naming scheme outside allowable range!\n"; } my $rejects = join "\n", @nolib; print $rejects; print "\n\nThe above files will be DELETED and not be further processed. "; } elsif ($file_nums == $trace_nums) { print "\nThere are $trace_nums files that match this naming scheme in $tracedir\n"; } else { die "There are $trace_nums trace files but only $file_nums files!\n"; } #print "Is this correct? (y/n): "; $input = &get_input("Is this correct? (y/n): "); chomp $input; if ($input =~ /n/i) { #print "Hit return to re-enter the trace file directory details, or q to quit\ntrace2dbest (so you can edit your trace directory): "; $input = &get_input("Hit return to re-enter the trace file directory details, or q to quit\ntrace2dbest (so you can edit your trace directory): "); chomp $input; if ($input =~ /^q{1}$/i) { print "Ok, trace2dbest will now exit...\n"; exit(); } } else { foreach my $file (@nolib) { my $rm_file = $tracedir . "/" . $file; print "removing $rm_file\n"; unlink ($rm_file) or die "could not unlink $rm_file! Please manually edit your trace directory\n"; #remove any files in trace that don't match naming scheme -brutal. } $trace_dir_flag =0; } $final_file_num = $trace_nums; #save trace nums as global variable for use later. ok to use trace nums because file_num and trace_num should be the same now. } #Section 3 - Ask parameter questions print colored("\nSection 3 - trace2dbest parameters\n", "bold"); print "\nYou now have the opportunity to set the various parameters that control how the\ntraces will be processed.\n"; #print "trace2dbest has default values for all the parameters, to use these defaults\nenter 's' (skip), otherwise hit return to alter parameters: "; $input = &get_input("trace2dbest has default values for all the parameters, to use these defaults\nenter 's' (skip), otherwise hit return to alter parameters: "); chomp $input; if ($input =~ /s/i) { print "Ok, trace2dbest default parameters will be used.\n"; #set defaults $vminmatch = 10; $vminscore =20; $eminmatch=20; $eminscore =30; $high_qual_cutoff =150; $poly_num = 8; } else { print "\nFor each of the parameters, enter the value you wish to use or hit return to\nuse the default shown in brakets().\n"; print colored ("\nphred","bold"); print "\n"; #print "Number of high quality bases required in sequence (150): "; my $flag = "1"; while ($flag) { my $input = &get_input("Number of high quality bases required in sequence (150): "); chomp $input; if ($input) { if ($input =~ /^\d*$/ ) { $high_qual_cutoff = $input; print "High quality cut-off set to $high_qual_cutoff\n"; $flag = 0; } else { print "You must enter a number!\n"; next; } }else { $high_qual_cutoff = 150; print "High quality cut-off of $high_qual_cutoff selected\n"; $flag =0; } } print colored ("\ncross_match\n","bold"); print "cross_match for vector sequence - \n"; $flag = "1"; #flag reused while ($flag) { #print "minmatch (10): "; my $input = &get_input("minmatch (10): "); chomp $input; if ($input) { if ($input =~ /^\d*$/ ) { $vminmatch = $input; print "minmatch set to $vminmatch\n"; $flag = 0; } else { print "You must enter a number!\n"; next; } }else { $vminmatch = 10; print "minmatch value of $vminmatch selected\n"; $flag =0; } } $flag = 1; while ($flag) { #print "minscore (20): "; my $input = &get_input("minscore (20): "); chomp $input; if ($input) { if ($input =~ /^\d*$/ ) { $vminscore = $input; print "minscore set to $vminscore\n"; $flag = 0; } else { print "You must enter a number!\n"; next; } }else { $vminscore = 20; print "minscore value of $vminscore selected\n"; $flag =0; } } if ($ecoli_cross) { print "cross_match for E.coli sequence - \n"; $flag = "1"; while ($flag) { #print "minmatch (20): "; my $input = &get_input("minmatch (20): "); chomp $input; if ($input) { if ($input =~ /^\d*$/ ) { $eminmatch = $input; print "minmatch set to $eminmatch\n"; $flag = 0; } else { print "You must enter a number!\n"; next; } }else { $eminmatch = 20; print "minmatch value of $eminmatch selected\n"; $flag =0; } } $flag = 1; while ($flag) { #print "minscore (30): "; my $input = &get_input("minscore (30): "); chomp $input; if ($input) { if ($input =~ /^\d*$/ ) { $eminscore = $input; print "minscore set to $eminscore\n"; $flag = 0; } else { print "You must enter a number!\n"; next; } }else { $eminscore = 30; print "minscore value of $eminscore selected\n"; $flag =0; } } } print colored ("\nPoly(A) tail","bold"); print "\n"; $flag = 1; while ($flag) { #print "Enter number bases in poly(A) tail (8): "; my $input = &get_input("Enter number bases in poly(A) tail (8): "); chomp $input; if ($input) { #if $input eq "0" then this will not pass and poly a will get set to 8. setting poly(A) to zero is a bad idea if ($input =~ /^\d{1,2}$/ ) { $poly_num = $input; print "poly(A) tail set to $poly_num\n"; $flag=0; } else { print "You must enter a whole number between 1 and 99!\n"; next; } } else { $poly_num = 8; print "poly(A) tail set to $poly_num\n"; $flag=0; } } print colored ("\nSpliced leader 1\n","bold"); print "If you wish to trim the nematode spliced leader sequence, type 'yes', or\n"; print "enter an alternative nucleotide sequence."; #print " If you do not wish to trim any\nspliced leader sequence just hit return: "; $input = &get_input("If you do not wish to trim any spliced leader sequence just hit return: "); chomp $input; if ($input) { if ($input =~ /yes/i) { $sl_flag = 1; $sl_seq = "ggtttaattacccaagtttgag"; print "The nematode SL sequence will be trimmed\n"; } elsif ($input =~ /[acgt]*/) { $sl_flag =1; $sl_seq = $input; print "The sequence $sl_seq will be trimmed\n"; } } else { print "Ok, no SL will be trimmed\n"; $sl_flag = 0; } } #Section 4 - Ask annotation questions print colored("\nSection 4 - Annotation of sequences\n", "bold"); if (!$pblastall && !$pblastcl3) { print "trace2dbest facilitates preliminary annotation in the form of a BLAST hit,\n however neither remote nor local BLAST utilities were found in your PATH.\n"; } else { print "Would you like to add BLAST-based preliminary annotation to the sequences?\n"; #print "Note: this will slow the process considerably (y/n): "; my $input = &get_input("Note: this will slow the process considerably (y/n): "); chomp $input; if ($input !~ /y/i) { print "Ok, no BLAST annotation will be added.\n"; } else { if ($pblastall && $pblastcl3) { print "You have two options:\n\n"; print "\t1 - Remote BLAST using NCBI (max ~400 sequences/day)\n"; print "\t2 - Local BLAST\n\n"; while () { #print "Enter number: "; my $input = &get_input("Enter number: "); chomp $input; if ($input =~ /^1$/) { $blast = 1; print "Ok, remote BLAST will be carried out on the processed sequences.\n"; last; } elsif ($input =~ /^2$/) { $blast = 2; last; } else { print "Please enter 1 or 2\n"; next; } } }elsif ($pblastall==1) { $blast =2 } elsif ($pblastcl3==1) { $blast = 1; print "Ok, remote BLAST will be carried out on the processed sequences.\n"; } else { die "blastall = $pblastall, blastcl3 = $pblastcl3"; } #selection of local blast databse if ($blast ==2) { my $flag =1; while ($flag) { #print "Please enter the name of the directory that holds your BLAST databases\n(e.g. /home/db/blastdb): "; my $input = &get_input("Please enter the name of the directory that holds your BLAST databases\n(e.g. /home/db/blastdb): "); chomp $input; if ($input =~ /^q{1}$/i) { print "Ok, trace2dbest will now exit...\n"; exit (); } if (-d $input && -r $input) { $blastdata =$input; @blastdb = glob ("$blastdata/*.pin"); if (!@blastdb) { print "$blastdata does not appear to have any protein BLAST databases, please try again.\n"; next; #start while loop again now so that flag doesn't get set to 0 } $flag = 0; } else { print "$input is not a directory or could not be read. Please try\nagain or type q to exit.\n"; } } if (@blastdb) { print "The following protein BLAST databases were found-\n"; my $number =1; foreach my $db (@blastdb) { $db =~ s/$blastdata\///; $db =~ s/\.pin//; print "$number - $db\n"; $number++; } $number = $number -1; #get real number of databases my $flag =1; while ($flag) { #print "Select a database by entering its number: "; my $input = &get_input("Select a database by entering its number: "); chomp $input; if ($input !~ /^\s*[0-9]+\s*$/i) { #only numbers and spaces print "Please enter a number between 1 and $number.\n"; } elsif ($input > $number && $input >= 1) { print "Please enter a number between 1 and $number.\n"; } else { $input--; $blast_database = $blastdb[$input]; $blast_database = $blastdata."/".$blast_database; #add path to blast database print "Ok, the database $blast_database will be used\n"; $flag =0; } } } } $flag = 1; while ($flag) { #print "What BLAST bit score cut-off value would you like to use (default 55)? "; $input = &get_input("What BLAST bit score cut-off value would you like to use (default 55)? "); chomp $input; if ($input) { if ($input !~ /^\s*[0-9]+\s*$/) { print "please enter a number.\n"; } else { $bits_cutoff = $input; $flag = 0; } } else { $bits_cutoff = 55; $flag = 0; } } #print "Would you like to save the BLAST reports? (y/n): "; $input = &get_input("Would you like to save the BLAST reports? (y/n): "); chomp $input; if ($input =~ /y/i) { $blast_save =1; } } } #section 5 -trace process print colored("\nSection 5 - Trace processing\n", "bold"); #set up mkdir "subfiles"; mkdir "fasta"; mkdir "qual"; mkdir "phd_dir"; mkdir "scf"; mkdir "fastafiles"; mkdir "partigene"; mkdir "blast_reports"; mkdir "process"; mkdir "process/traceinfo"; #phred print "Running phred (basecalling from raw traces) - please wait..."; #see below (sequence processing) for explaination of phred arguements system("phred -id $tracedir -cd scf -pd phd_dir -qd qual -sd fasta -exit_nomatch -trim \"\" 2>>phredlogfile 1> /dev/null"); #exit_nomatch means phred will exit if the trace chemistry is not found in phredpar.dat if (my @phred_out = glob ("fasta/*.seq")) { my $phred_out_size = @phred_out; if ($phred_out_size == $final_file_num) { print LOG "PHRED: .seq files found in fasta - phred ok.\n"; print "Done\n"; } else { print LOG "\nPHRED WARNING: $final_file_num files with a valid name where passed to phred, but only $phred_out_size files were processed!\n"; my $unphreded = $final_file_num - $phred_out_size; print "\n\t$unphreded of $final_file_num files were not processed by phred. See phredlogfile and logfile for details.\n"; #print "\tEnter 'q' to quit or hit return to continue: "; my $input = &get_input("\tEnter 'q' to quit or hit return to continue: "); chomp $input; if ($input =~ /^q{1}$/i) { print "Ok, trace2dbest will now exit...\n"; exit(); } } $stats[1] = $phred_out_size; } else { print LOG "PHRED: No .seq files found in fasta directory\n"; print colored ("FAILED!\n", "red bold"); print "Check logfile and phredlogfile. trace2dbest will now exit...\n"; exit(); } #cross_match for vector #concatenate the .seq files created by phred into all.seq - makes cross_match quicker system("cat ./fasta/*.seq > all.seq" || die "cat fasta/*.seq failed!\n"); print "Running cross_match (screening for vector sequence) - please wait..."; system("cross_match all.seq $vector_file -minmatch $vminmatch -minscore $vminscore -screen 2>>cmlogfile 1> /dev/null"); if (-s "all.seq.screen") { print LOG "Vector crossmatch: all.seq.screen - vector crossmatch ok.\n"; print "Done\n"; } else { print LOG "Vector crossmatch: No all.seq.screen\n"; print colored ("FAILED!\n", "red bold"); print "Check logfile and cmlogfile. trace2dbest will now exit...\n"; exit(); } #change the cross_match Xs to Zs so that they can be differentiated from the cross_match Xs from ecoli screen. open (SCREENED, "all.seq.screen")or die "Could not open all.seq.screen|\n"; my @screened = ; close SCREENED; foreach my $line (@screened) { $line =~ s/X/Z/g; } open (SCREEN_F, ">all.seq.screen"); #overwrite all.seq.screen print SCREEN_F @screened; close SCREEN_F; #cross_match for ecoli if ($ecoli_cross) { rename "all.seq.screen", "all.n"; #vector masked sequence produced by cross_match to all.n print "Running cross_match (screening for E. coli sequence) - please wait..."; system("cross_match all.n $ecoli_file -minmatch $eminmatch -minscore $eminscore -screen 2>>cmlogfile 1> /dev/null"); if (-s "all.n.screen") { print LOG "Ecoli crossmatch: all.n.screen - ecoli crossmatch ok.\n"; } else { print LOG "Ecoli crossmatch: No all.n.screen\n"; print colored ("FAILED!\n", "red bold"); print "Check logfile and cmlogfile. trace2dbest will now exit...\n"; exit(); } split_screened("all.n.screen","fasta"); #split output (all.n.screen) into individual files if (glob ("fasta/*.seqsc")) { print LOG "E.coli crossmatch: .seqsc files found in fasta - ecoli cm ok.\n"; print "Done\n"; } else { print LOG "E.coli crossmatch: No .seqsc files found in fasta\n"; print colored ("FAILED!\n", "red bold"); print "Check logfile and cmlogfile. trace2dbest will now exit...\n"; exit(); } unlink glob "all.n*"; unlink glob "all.seq*"; } else { split_screened ("all.seq.screen", "fasta"); unlink ("all.seq","all.seq.log", "all.seq.screen"); } #section 6 -sequence process print colored("\nSection 6 - Sequence processing\n", "bold"); #after basecalling by phred, the sequence is trimmed using the lowest(upstream) and highest(downstream) trim values from the phd #file TRIM: field and the trim values suggested by the -trim arguement (shown in header of fasta dir files). Generally the lowest #and highest values come from the phd file but sometimes -trim gives the lowest trim position. The remaining trim values are used #to define the "high quality" section of the sequence. These values are changed accordingly following the subsequent sequnce #processing to remove vector, ecoli, NNN+ etc. opendir (PHD_DIR, "./phd_dir"); #open the directory to allow looping thru the list of files while (my $file= readdir PHD_DIR) { my ($sequence, $t_hq_start, $t_hq_total, $t_hq_end, $p_hq_start, $p_hq_end, $trim_start, $trim_end, $hi_qual_start, $hi_qual_end, $seq_length, $trace_name, $svector_seq);#clear varibables my $polyA_des = "poly(A) not found"; next if $file =~ /^\./; #avoid . and .. $file=~ s/\.phd\.1/\.seqsc/; #convert file name open (FASTA_FILE, "<./fasta/$file") or die "The file $file in the ./fasta directory could not be opened!\n"; while (my $line = || !eof(FASTA_FILE)) { if ($line =~ />.+\s+\d+\s+(\d+)\s+(\d+)/) { #get the trim info from the fasta header (number of bases to be trimmed at begining and length of seq after trimming $t_hq_start = $1; $t_hq_total =$2; if ($t_hq_total ==0) { $stats[2]++; #add to 'number of low quality traces' stats } } else { chomp $line; $sequence .= $line; #add the sequence to $sequence } } close (FASTA_FILE); $file=~ s/\.seqsc/\.phd\.1/; #and back again open (PHD_FILE, "<./phd_dir/$file") or die "The file $file in the ./phd_dir directory could not be opened!\n"; #open the equivalent file in the phd_dir while (my $line = ){ if($line=~/TRIM:\s(-?\d+)\s(-?\d+)/){ #get 1st and last bases of high quality read segment use -? to allow for presence of -1 $p_hq_start=$1; $p_hq_end=$2; last; #from trim line of phd file } } close(PHD_FILE); #convert trim positions to same format $t_hq_end = $t_hq_start + $t_hq_total; $t_hq_start++; #in fasta file this value refers to number of bases to be trimmed at start, we want position of high quality, so add 1 #get lowest and highest for trimming, use remaining two values for defining high qual section if ($p_hq_start < 0) { $p_hq_start = 0; } if ($t_hq_start >= $p_hq_start) { $trim_start = $p_hq_start-1; #minus 1 to give position of last base to be trimmed at start of raw sequence (= number of bases trimmed at start) $hi_qual_start = $t_hq_start; } else { $trim_start = $t_hq_start -1; #minus 1 to give position of last base to be trimmed at start of raw sequence $hi_qual_start = $p_hq_start; } if ($t_hq_end <= $p_hq_end) { $trim_end = $p_hq_end +1; #add one to give postion of first base to be trimmed at end of raw sequence $hi_qual_end = $t_hq_end; } else { $trim_end = $t_hq_end +1; #add one to give postion of first base to be trimmed at end of raw sequence $hi_qual_end = $p_hq_end; } #trim sequence $seq_length = ($trim_end -$trim_start) -1; #minus 1 to give length of trimmed seq #trim_start shouldn't be negative because that messes up the substr. so if <0 add 1. make new variable to keep trim_start consistent with rest of script my $trim_start_new; if ($trim_start < 0) { $trim_start_new = 0; } else { $trim_start_new = $trim_start; } $sequence = substr ($sequence, $trim_start_new, $seq_length); #convert hi qual regions to positions in trimmed sequence (instead of raw sequence) $hi_qual_start = $hi_qual_start -$trim_start; $hi_qual_end = $hi_qual_end - $trim_start; #adjust high qual region positions #print sequence details to process dir ($trace_name = $file) =~ s/\.phd.1//; open (TRACEINFO, ">./process/traceinfo/$trace_name") or die "The file $trace_name in the ./process/traceinfo directory could not be created/opened!\n"; print TRACEINFO "$trace_name Sequence processing information\n"; print TRACEINFO "Quality trimming information\n"; print TRACEINFO "Position of last base trimmed at start of raw sequence: $trim_start\n"; print TRACEINFO "Position of first base trimmed at end of raw sequence: $trim_end\n"; print TRACEINFO "Length of sequence after trimming: $seq_length\n"; print TRACEINFO "Start position of high quality region in trimmed sequence: $hi_qual_start\n"; print TRACEINFO "End position of high quality region in trimmed sequence: $hi_qual_end\n"; #chuck out sequences that have high qual section < $high_qual_cutoff if (($hi_qual_end - $hi_qual_start) <= $high_qual_cutoff) { my $high_qual_length = $hi_qual_end - $hi_qual_start; if ($high_qual_length < 0) { $high_qual_length =0 } print TRACEINFO "Sequence rejected - high quality length () is less than $high_qual_cutoff\n"; print "$trace_name has beeen rejected, high quality sequence length = $high_qual_length\n"; print LOG "$trace_name has beeen rejected, high quality sequence length = $high_qual_length\n"; next; } #trim adapter (if one has been entered) #do this before vector trimming, otherwise vector trimming may take off a little bit of adapter meaning that this section wouldn't recognise it. if($adapter){ if($sequence =~ s/^(.{0,200}$adapter)//i) { # remove adapter if it appears in first 200 bases my $adapter_trim = $1; my @adapter_trim = split ('', $adapter_trim); my $at_length= @adapter_trim; open (ADAPTERTRIM, ">>./process/adapter_trim") or die "Could not open process/adapter_trim!\n"; print ADAPTERTRIM "$trace_name: $at_length bases were trimmed\nTrimmed sequence: $adapter_trim\n\n"; close ADAPTERTRIM; print TRACEINFO "Adapter trimming\nAdapter sequence: $adapter\nLength of sequence trimmed (from start):$at_length\nSequence trimmed= $adapter_trim\n"; $hi_qual_start = $hi_qual_start - $at_length; #may make hi qual start -ve but that shouldn't matter $hi_qual_end = $hi_qual_end -$at_length; } } #trimming for vector - Xs now converted to Zs if($sequence =~ s/^(.{0,150}Z+)//i) { #substitue sequence beginning with any number of chars between #0 and 150, followed by any number of X's, with nothing. my $svector_seq = $1; my @svector = split('',$svector_seq); my $sv_length = @svector; open (VECTORTRIM, ">>./process/vector_trim") or die "Could not open process/vector_trim!\n"; print VECTORTRIM "$trace_name: $sv_length bases were trimmed at the start of the sequence.\nTrimmed sequence: $svector_seq\n\n"; close VECTORTRIM; print TRACEINFO "Leading vector trimming\nLength trimmed at start= $sv_length\nSequence trimmed= $svector_seq\n"; $hi_qual_start = $hi_qual_start - $sv_length; #may make hi qual start -ve but that shouldn't matter $hi_qual_end = $hi_qual_end -$sv_length; } if($sequence =~ s/(Z+.{0,150})$//i){ # remove trailing vector seq my $evector_seq = $1; my @evector = split('',$evector_seq); my $ev_length = @evector; #sv_length reused open (VECTORTRIM, ">>./process/vector_trim") or die "Could not open process/vector_trim!\n"; print VECTORTRIM "$trace_name: $ev_length bases were trimmed at the end of the sequence.\nTrimmed sequence: $evector_seq\n\n"; close VECTORTRIM; print TRACEINFO "Trailing vector trimming\nLength trimmed at end= $ev_length\nSequence trimmed= $evector_seq\n"; #high qual start shouldn't be affected but high qual end might be. hi qual end can be made = end of seq later. } #trimming for ecoli - Xs still Xs if($sequence =~ s/^(.{0,150}X+)//i) { #substitue sequence beginning with any number of chars between #0 and 150, followed by any number of X's, with nothing. my $secoli_seq = $1; my @secoli = split('',$secoli_seq); my $sv_length = @secoli; open (ECOLITRIM, ">>./process/ecoli_trim") or die "Could not open process/ecoli_trim!\n"; print ECOLITRIM "$trace_name: $sv_length bases were trimmed at the start of the sequence.\nTrimmed sequence: $secoli_seq\n\n"; close ECOLITRIM; print TRACEINFO "Leading E.coli sequence trimming\nLength trimmed at start= $sv_length\nSequence trimmed= $secoli_seq\n"; $hi_qual_start = $hi_qual_start - $sv_length; #may make hi qual start -ve but that shouldn't matter $hi_qual_end = $hi_qual_end -$sv_length; } if($sequence =~ s/(X+.{0,150})$//i){ # remove trailing ecoli seq my $eecoli_seq = $1; my @eecoli = split('',$eecoli_seq); my $ev_length = @eecoli; #sv_length reused open (ECOLITRIM, ">>./process/ecoli_trim") or die "Could not open process/ecoli_trim!\n"; print ECOLITRIM "$trace_name: $ev_length bases were trimmed at the end of the sequence.\nTrimmed sequence: $eecoli_seq\n\n"; close ECOLITRIM; print TRACEINFO "Trailing E.coli sequence trimming\nLength trimmed at end= $ev_length\nSequence trimmed= $eecoli_seq\n"; #high qual start shouldn't be affected but high qual end might be. hi qual end can be made = end of seq later. } #trimming for N's if($sequence =~ s/^(.{0,150}NNN+)//i) { # remove low quality bases if at least 3 in a row my $ntrim_seq = $1; my @ntrim = split('',$ntrim_seq); my $nt_length = @ntrim; open (QUALTRIM, ">>./process/quality_trim") or die "Could not open process/quality_trim!\n"; print QUALTRIM "$trace_name: $nt_length bases trimmed from start of sequence\nTrimmed sequence:$ntrim_seq\n\n"; close QUALTRIM; print TRACEINFO "Start N trimming\nLength of sequence trimmed (from start): $nt_length\nSequence trimmed= $ntrim_seq\n"; $hi_qual_start = $hi_qual_start - $nt_length; #may make hi qual start -ve but that shouldn't matter $hi_qual_end = $hi_qual_end -$nt_length; } #trim SL-1 or equivalent #Most C. elegans genes are trans-spliced at their 5' end to a small leader exon, #called SL-1. Estimates of the actual proportion range from 80% to 60%. The SL-1 sequence can be used as a #5' end marker - any sequence upstream of it can be removed. #this bit of code finds the sl and any sequence before it. #complement sl sequence not trimmed because this is likley to be in the low quality region of the sequence read. #added 6/5/03 #this an optional step and the user can enter their own sequence for removal if ($sl_flag) { #sl sequence to be found and all upstream sequence removed if ($sequence =~ s/^(.*$sl_seq)//i) { my $sl_trim = $1; my @sl_trim = split ('',$sl_trim); my $st_length = @sl_trim; open (ADAPTERTRIM, ">>./process/adapter_trim") or die "Could not open process/adapter_trim!\n"; print ADAPTERTRIM "$trace_name: $sl_seq found, $st_length bases were trimmed\nTrimmed sequence: $sl_trim\n\n"; close ADAPTERTRIM; print TRACEINFO "Trimming for $sl_seq\nLength of sequence trimmed (from start): $st_length\nSequence trimmed= $sl_trim\n"; $hi_qual_start = $hi_qual_start - $st_length; #may make hi qual start -ve but that shouldn't matter $hi_qual_end = $hi_qual_end -$st_length; } } #poly A trimming there is sill the potential that a poly A/T will not be properly trimmed if it overlaps the 150 bp boundary, but this shouldn't cause too much of a problem. if ($sequence =~ /(.{150,}?)(a{$poly_num}.*)/i) { #last occrence of polya in the first 150 or next polya after first 150 #poly_num is the user specified length of poly a tail (default is 8) my $pa_trim = $2; $sequence = $1; #not using substition of $2 so that we get the right string in the right position extracted. my @pa_trim = split ('',$pa_trim); my $pat_length = @pa_trim; open (POLYATRIM, ">>./process/polya_trim") or die "Could not open process/polya_trim!\n"; print POLYATRIM "$trace_name: $pat_length bases were trimmed for poly(A)\nTrimmed sequence: $pa_trim\n\n"; close POLYATRIM; print TRACEINFO "Poly(A) trimming\nLength of sequence trimmed (from end): $pat_length\nSequence trimmed= $pa_trim\n"; $polyA_des = "poly(A) trimmed"; } #polyT trimming - revised algorithm. my $rev_sequence = reverse $sequence; #reverse sequence so we can look for polyT in same way as polyA if ($rev_sequence =~ /(.{150,}?)(t{$poly_num}.*)/i ) { #poly_num is the user specified length of poly a tail (default is 8) my $pt_trim = $2; $rev_sequence = $1; my @pt_trim = split ('',$pt_trim); my $ptt_length = @pt_trim; $pt_trim = reverse $pt_trim; #make pt_trim represent trimmed sequence as it would be in the actual (non reversed) sequence open (POLYATRIM, ">>./process/polya_trim") or die "Could not open process/polya_trim!\n"; print POLYATRIM "$trace_name: $ptt_length bases were trimmed for poly(t)\nTrimmed sequence: $pt_trim\n\n"; close POLYATRIM; print TRACEINFO "Poly(t) trimming\nLength of sequence trimmed (from start): $ptt_length\nSequence trimmed= $pt_trim\n"; $hi_qual_start = $hi_qual_start - $ptt_length; #may make hi qual start -ve but that shouldn't matter $hi_qual_end = $hi_qual_end -$ptt_length; $polyA_des = "poly(T) trimmed"; } $sequence = reverse $rev_sequence; #return $sequence to original orientation #check for Zs (vector) in middle of sequence (leading and trailing Zs already removed). Zs in middle may have occured because the trace represents a chimeric EST or cross_match has mistakenly identified ecoli seq in middle of sequence. they may also be other reasons. $chim_flag = 0; my $mid_z_num = 0; while ($sequence =~ s/[z]/n/i) { $mid_z_num++; $chim_flag = 1; } if ($chim_flag) { open (VECTORTRIM, ">>./process/vector_trim") or die "Could not open process/vector_trim!\n"; print VECTORTRIM "$trace_name is a potential chimera - cross_match has masked $mid_z_num vector bases\n"; print VECTORTRIM "in the middle of this sequence. The cross_match X's have been replaced with n's.\n"; close VECTORTRIM; print TRACEINFO "**potential chimera - cross_match has masked $mid_z_num vector bases in the middle of the sequence!\n"; print "$trace_name is a potential chimera - cross_match has masked $mid_z_num vector bases\n"; print "in the middle of this sequence. The cross_match X's have been replaced with n's.\n"; print LOG "$trace_name is a potential chimera - cross_match has masked $mid_z_num vector bases in the\nmiddle of this sequence. The cross_match X's have been replaced with n's.\n"; } #repeat above for Xs (ecoli) #---check for Xs in middle of sequence (leading and trailing Xs already removed). Xs in middle may have occured because the trace represents a chimeric EST or cross_match has mistakenly identified ecoli seq in middle of sequence. they may also be other reasons. $chim_flag_e = 0; my $mid_x_num = 0; while ($sequence =~ s/[x]/n/i) { $mid_x_num++; $chim_flag_e = 1; } if ($chim_flag_e) { open (VECTORTRIM, ">>./process/vector_trim") or die "Could not open process/vector_trim!\n"; print VECTORTRIM "$trace_name is a potential chimera - cross_match has masked $mid_x_num E.coli bases\n"; print VECTORTRIM "in the middle of this sequence. The cross_match X's have been replaced with n's.\n"; close VECTORTRIM; print TRACEINFO "**potential chimera - cross_match has masked $mid_x_num E.coli bases in the middle of the sequence!\n"; print "$trace_name is a potential chimera - cross_match has masked $mid_x_num E.coli bases\n"; print "in the middle of this sequence. The cross_match X's have been replaced with n's.\n"; print LOG "$trace_name is a potential chimera - cross_match has masked $mid_x_num E.coli bases in the\nmiddle of this sequence. The cross_match X's have been replaced with n's.\n"; } #finalise and check high qual lengths my @sequence = split('',$sequence); my $sequence_length = @sequence; if ($hi_qual_end > $sequence_length) { #adjust high qual end in case it is longer than the sequence length (e.g. if a big chunk of trailing vector was removed) $hi_qual_end = $sequence_length; } if ($hi_qual_start < 1) { #adjust high qual start in case it is less than 1 (e.g. if a big chunk of leading vector was removed) $hi_qual_start = 1; } #chuck out sequences that have high qual section < $high_qual_cutoff if (($hi_qual_end - $hi_qual_start) <= $high_qual_cutoff) { my $high_qual_length = $hi_qual_end - $hi_qual_start; if ($high_qual_length < 0) { $high_qual_length =0 } #make high qual a sensible number if less than zero print TRACEINFO "Sequence rejected - high quality length ($high_qual_length bp) is less than $high_qual_cutoff\n"; print "$trace_name has beeen rejected, high quality sequence length after trimming = $high_qual_length (cut off $high_qual_cutoff)\n"; print LOG "$trace_name has beeen rejected, high quality sequence length after trimming = $high_qual_length (cut off $high_qual_cutoff)\n"; next; } open(OUTFILE,">>subfiles/$trace_name"); print OUTFILE ">$trace_name hq start $hi_qual_start hq end $hi_qual_end $polyA_des"; if ($chim_flag) { print OUTFILE " chimeric(vector)?"; } if ($chim_flag_e) { print OUTFILE " chimeric(E.coli)?"; } print OUTFILE "\n"; my $n=0; while ($sequence =~ /(.)/g) { print OUTFILE "$1"; $n++; if(($n % 60) == 0) { print OUTFILE "\n"; } } if(($n % 60) != 0) { print OUTFILE "\n"; } close OUTFILE; close TRACEINFO; close VECTORTRIM; close QUALTRIM; close ADAPTERTRIM; close POLYATRIM; #if not already closed } # creation of submission files if ($blast == 0) { print colored("\nCreating submission files...\n", "bold"); } else { print colored("\nPerforming BLAST searches and creating submission files...\n", "bold"); } # opendir (DIR, "./subfiles"); while (defined (my $file = readdir(DIR))) { if (($scheme ==1 && $file =~ /^[A-Za-z]{2}_\w{2,5}_(\d{2})([A-Za-z])(\d{2})$/) || ($scheme ==2 && $file =~ /^[A-Z][a-z][A-Z]{2}[0-9]{2}[a-z]([0-9]{2})([a-z])([0-9]{2})[a-z][0-9]_[A-Z][a-z]{2}[A-Z][a-z]$/)) { my ($hi_qual_s, $hi_qual_end, $seq, $put_id, $plate, $row, $column, $poly_A, $chimeric, $chimeric_e); $plate = $1; $row = $2; $column = $3; #get plate coordinates from name to use in submission file open(INFILE,"subfiles/$file.sub"); #create this file for EST sub file open(FASTAFILE,">fastafiles/$file.fsa"); #create this file to hold the processed seq. while (my $line = ) { if ($line=~ /^>.+\shq\sstart\s(\d+)\shq\send\s(\d+)(.*)/) { #regex matching fasta type header of the file $hi_qual_s=$1; $hi_qual_end=$2; my $dollar3 = $3; if ($dollar3) { if ($dollar3 =~ /trimmed/){ $poly_A=1; } if ($dollar3 =~ /chimeric\(vector\)/) { $chimeric =1; } if ($dollar3 =~ /cimeric\(E.coli\)/) { $chimeric_e = 1; } } } else{ # Read in the sequence chop $line; $seq .= $line; } } $stats[3]++; #add to "number of submission files" $stats[4] += length($seq); #gives length of sequence $stats[5]+= ($hi_qual_end - $hi_qual_s); #gives length of high qual part of seq if($blast != 0){ print "Blasting $file..."; $put_id = put_id($blast,$blast_save,$seq,$file,$blast_database, $bits_cutoff); if ($put_id) { print "top hit: $put_id\n"; } else { print "No significant hits found\n"; } } #extract from $primer the primer NAME. $primer should be in format (e.g.) "T7 (aggagagagcgtg)" or "T7" my $primer_name = $seq_primer; if ($primer_name =~ s/\(.*\)//) {} #get name part if ($primer_name =~ s/\s//g) {} #remove whitespace print SUBFILE "TYPE: EST\nSTATUS: New\n"; print SUBFILE "CONT_NAME: $cont_name\nCITATION:\n"; print SUBFILE "$pub_cit\n"; print SUBFILE "LIBRARY: $lib_name\n"; print SUBFILE "EST#: $file\_$primer_name\n"; print SUBFILE "CLONE: $file\n"; print SUBFILE "SOURCE:\n"; print SUBFILE "PCR_F: $pcrf_primer\n"; print SUBFILE "PCR_B: $pcrr_primer\n"; print SUBFILE "PLATE: $plate\n"; print SUBFILE "ROW: $row\n"; print SUBFILE "COLUMN: $column\n"; print SUBFILE "SEQ_PRIMER: $seq_primer\n"; if ($p_end) { print SUBFILE "P_END: $p_end\n"; } print SUBFILE "HIQUAL_START: $hi_qual_s\n"; print SUBFILE "HIQUAL_STOP: $hi_qual_end\n"; print SUBFILE "DNA_TYPE: cDNA\n"; print SUBFILE "PUBLIC: $public_date\n"; if($put_id && $put_id !~ /No\shits\sfound/) { print SUBFILE "PUT_ID: Similar to $put_id\n"; } else { print SUBFILE "PUT_ID:\n"; } if ($poly_A) { print SUBFILE "POLYA: Y\n"; } print SUBFILE "COMMENT:\n"; if ($comment) { print SUBFILE "$comment\n"; } if ($chimeric) { #1 if x's (from vector)(possible chimera) found in sequence print SUBFILE "A section within this EST was identified as potentially derived from\nvector sequence. It has been replaced by 'N' residues.\n"; } if ($chimeric_e) { #1 if x's (from E.coli)(possible chimera) found in sequence print SUBFILE "A section within this EST was identified as potentially derived from\nE.coli sequence. It has been replaced by 'N' residues.\n"; } print SUBFILE "SEQUENCE:\n"; print FASTAFILE ">$file\n"; my $n=0; while ($seq=~/(.)/g) { print SUBFILE "$1"; print FASTAFILE "$1"; $n++; if(($n % 60) == 0) { print FASTAFILE "\n"; print SUBFILE "\n"; } } if(($n % 60) != 0) { print FASTAFILE "\n"; print SUBFILE "\n"; } print SUBFILE "||\n"; close SUBFILE; close FASTAFILE; } } print colored ("Done\n", "bold"); # statistics print colored("\nStatistics\n", "bold"); print ("\n\t$stats[1]\t\tTraces processed\n"); if ($stats[1] == 0) { print "No traces processed. Check logfile, cmlogfile and phred logfile for errors\n"; print "trace2dbest will now exit\n"; exit(); } my $good = $stats[1] - $stats[2]; my $percent = int ($good * 100 / $stats[1]); print ("\t$good ($percent%)\t'Good quality' traces\n"); my $percent2 = int ($stats[3] * 100/$stats[1]); print ("\t$stats[3] ($percent2%)\tSubmissable sequences after trimming\n"); if ($stats[3] == 0) { print "No submissable traces. Check logfile, cmlogfile and phred logfile for errors\n"; print "trace2dbest will now exit\n"; exit(); } my $av_l = int($stats[4] / $stats[3]); print ("\t$av_l\t\tAverage length of submissable sequences\n"); my $av_h = int($stats[5] / $stats[3]); print ("\t$av_h\t\tAverage number of high quality bases for submissable sequences\n"); print ("\n\n"); #write these stats to a file for inclusion in output dir open (STATS, ">>./process/statistics"); print STATS "\n\t### Trace process statistics ###\n"; print STATS "$stats[1]\t\tTraces processed\n"; print STATS "$good ($percent%)\t\t'Good quality' traces\n"; print STATS "$stats[3] ($percent2%)\t\tSubmissable sequences after trimming\n"; print STATS "$av_l\t\tAverage length of submissable sequences\n"; print STATS "$av_h\t\tAverage number of high quality bases for submissable sequences\n"; #Section 7 - submission print colored ("\nSection 7 - Submission and saving of files\n\n", "bold"); #system ("cat subfiles/*.sub > dbEST_submission.txt"); #cat submission records into one file for submission opendir (SUBFILES, "./subfiles") or die "Could not open ./subfiles\n"; my @sub_files = grep /.sub/ , readdir SUBFILES; #don't get . and .. foreach my $sub_file (@sub_files) { open (DBEST, "./subfiles/$sub_file") or die "could not open ./subfile/$sub_file\n"; while (my $line = ) { $dbESTsubmission .= $line; } } #add Pub, Lib and Cont files if (!$no_lib) { #only put these files in sub file if they are to be submitted. $dbESTsubmission = (join "", @Libfile_data).$dbESTsubmission; } if (!$no_pub) { $dbESTsubmission = (join "", @Pubfile_data).$dbESTsubmission; } if (!$no_cont) { $dbESTsubmission = (join "", @Contfile_data).$dbESTsubmission; } #write the sub file to a file open (FINALSUB, ">./dbEST_submission.txt"); print FINALSUB $dbESTsubmission; close FINALSUB; print "The EST submission records have been merged into one file.\n"; #print "Would you like to view this completed submission file? [y/n]"; my $input_view = &get_input("Would you like to view this completed submission file? [y/n]"); chomp $input_view; if ($input_view =~ /y/i) { if ($less_flag == 1) { system("less dbEST_submission.txt"); } else { system("more dbEST_submission.txt"); } } print "\nThe EST submission file may now be sent by e-mail to NCBI dbEST.\n"; print "Enter yes to send file to dbEST now, or any other key to continue without\n"; #print "submitting (your file will be saved): "; my $input_submit = &get_input("submitting (your file will be saved): "); chomp $input_submit; my $sent_flag = 0; if ($input_submit =~ /^yes$/i) { my @info = &sendEST($dbESTsubmission); #send sub file and get reply to for email to egdc my $mail_flag = shift @info; if ($mail_flag) { print "Mail sent - A copy of the mail was sent to the reply email entered above.\n"; $sent_flag = 1; } my $reply_to = shift @info; my $smtp_server = shift @info; print "Enter yes if you would also like to send notification of your submission to the\n"; #print "Environmental Genomics Data Centre (EG awardees only): "; my $input = &get_input("Environmental Genomics Data Centre (EG awardees only): "); chomp $input; if ($input =~ /^yes$/i) { my $mail2_flag = &sendESTegdc($reply_to,$stats[3], $smtp_server); if ($mail2_flag) { print "Mail sent.\n"; $sent_flag = 1; } } else { print "Ok, no notification will be sent to EGTDC.\n"; } } else { print "Ok, file not submitted to dbEST.\n" } #merge logfiles and tidy up. if (-e "phredlogfile") { system ("cat phredlogfile >> logfile"); unlink "phredlogfile"; } if (-e "cmlogfile") { system ("cat cmlogfile >> logfile"); unlink "cmlogfile"; } if (-e "temp.seq") { unlink "temp.seq"; } &save_files($sent_flag, $save_location); #unlink "logfile"; exit(); #######n################## ######################################################/#\#####}##################################################### #################### - - SUB ROUTINES - - ###########/###\####}####################+++++++######################### ####################################################/#####\###}~~~################################################ ########################## sub print_title() { #displays title print colored("\t\t####################################################\n","bold"); print colored("\t\t### ###\n","bold"); print colored("\t\t### trace2dbest Version 2.1 ###\n","bold"); print colored("\t\t### trace file processing and dbEST ###\n","bold"); print colored("\t\t### sequence submission tool ###\n","bold"); print colored("\t\t### ###\n","bold"); print colored("\t\t####################################################\n","bold"); } ##################################################################################### sub file_choices_page() { #Asks where to get Publication/Lib/Cont file info from (and if it is required) my ($file_type) =shift @_; my ($db_file) = shift @_; my ($skipintro) = shift @_; my $answer; unless ($skipintro) { print colored ("\n$file_type file\n", "bold"); print "\n\tEach batch of EST submissions to dbEST must have an associated\n"; print "\t$file_type file.\n"; print "\tPlease choose one of the following options:\n"; } print "\n\t1 - $file_type file already submitted\n"; print "\t2 - Enter information for a new $file_type file now\n"; if (!-r $db_file) { #this loop allows us to reset the location of the files, e.g. reset the $lib_file variable while (!-r $db_file) { #file is (!not)readable, e.g. .db files haven't stayed in installation dir of basedir/file.db #print "Warning: $db_file\nwas not found.\nPlease enter the location of the $file_type file: "; $db_file = &get_input("Warning: $db_file\nwas not found.\nPlease enter the location of the $file_type file: "); chomp $db_file; } #correct .db file location if it has changed from default defined at top of script #and make $db_file point to the actual file if ($file_type eq "Library") { $Lib_file = $db_file."/Libfile.db"; $db_file = $Lib_file; } if ($file_type eq "Publication") { $Pub_file = $db_file."/Pubfile.db"; $db_file = $Pub_file; } if ($file_type eq "Contact") { $Cont_file = $db_file."/Contfile.db"; $db_file = $Cont_file; } } open (DBFILE, "$db_file") or die "Error: $db_file could not be opened!\n"; my $num = 2; my $flag = 1; my $marker = 0; while (my $line =) { if ($marker) { #in the 1st line of TITLE free text in pub file $num++; if ($num == 3) { #we only want to print this line once print "\n\tOr use a saved file...\n\n"; } print "\t$num - "; chomp $line; #keeps pub file entries in same format as others print $line; print "\n"; $marker = 0; } elsif ($line =~ /^TITLE:/) { #pub file $marker = 1; #next line is the title free text } elsif ($line =~ /^NAME:\s(.*)/) { #lib and cont files $num++; if ($num == 3) { #we only want to print this line once print "\n\tOr use a saved file...\n\n"; } print "\t$num - "; print $1; print "\n"; } } close DBFILE; # print "\n\tDon't fancy one of those? Enter h for help or q to quit.\n"; my $flag1=0; while($flag1==0) { $answer= &get_input("\n\tDon't fancy one of those? Enter h for help or q to quit.\n"); chomp $answer; if($answer =~/h/i || $answer =~/q/i ) { $flag1=1; next; } elsif ($answer =~ /^\d+$/) { #if it's only digits if ($answer > $num || $answer == 0) { #if answer is a number outside the right range... print "Please enter a number between 1 and $num, or h for help and q to quit:\n"; } elsif ($answer eq 1) { #else it must be in range so send back if 1 $flag1=1; next; } else { #one of the records from ESTfile.db has been selected $flag1 = 1; next; } } else { #if answer is not digits... print "Please enter a number between 1 and $num, or h for help and q to quit:\n"; } } return $answer; } ############################################################################################ sub get_file_data() { #gets the selected record from ESTfile.db, Libfile.db, Pubfile.db or Contfile.db my ($record_num) = shift @_; my ($db_file) = shift @_; my ($type_id) = shift @_; my $count =0; my $flag; my @rec; if ($type_id ne "RECORD_ID: ") { $record_num = $record_num -2; #set record num for all file types except ESTfile (different because no "file not needed" option) } else { $record_num--; #make record equal to the desired record number for all other types of file } open (DBFILE, "$db_file"); while (my $line =) { if ($line =~ /^$type_id/) { $count++; } if ($count == $record_num) { $flag = 1; } if ($flag) { push (@rec, $line); } if ($line =~ /^\|\|/) { $flag = 0; } } print "\nPlease take a few seconds to check this information\n\n"; print @rec; #print "\nIs this the correct data? [y/n]\n"; my $input = &get_input("\nIs this the correct data? [y/n]\n"); chomp $input; if ($input =~ /y/i) { return @rec; } else { print "Ok, please choose again.\n"; undef @rec; #make the array null to indicate that no file has been selected return @rec; } } ################################################################################################ sub newPub_file_data() { #gets info from user to make Pub file. #General plan - from the user input side, it is better to get all the obligatory info, then ask for the optional #info. Use the status to ask only relevent questions. storing the answers to the obligatory info as varibles # (instead of just putting them straight into the array) means that they can be added later, with the # non-obligatory information. This allows all the data to be put in the array in the right order. my @nPub_file_data; my $ok_flag = 1; while ($ok_flag) { $nPub_file_data[0] = "TYPE: Pub\n"; my $ob_info1 = "STATUS: "; my $ob_info2 = "TITLE: \n"; #set up variables for the obligatory info (\n in TITLE so the free text starts on the next line) my $ob_info3 = "AUTHORS: \n"; my $ob_info4 = "YEAR: "; my ($status, $nob_info1, $nob_info2, $nob_info3, $nob_info4, $nob_info5 ); print "\nInformation for new Publication file\n"; print "Please answer the following questions about the publication for the ESTs you wish\n"; print "to submit.\n"; my $flag = 1; while ($flag == 1) { print "\nWhat is the publication status? 1=unpublished, 2=submitted, 3=in press,\n4=published.\n"; #print "Enter the appropriate number: "; $status = &get_input("Enter the appropriate number: "); chomp $status; if ($status !~ /^[1-4]$/) { print "Please enter a number between 1 and 4!"; } else { $flag =0; } } $ob_info1 .= $status; #save status for later (only ask relevent questions for non-obligatory part of pub file) $ob_info1 .= "\n"; $flag = 1; while ($flag) { print "\nWhat is the title of the article? "; print "(The title you enter here will also be used in the CITATION field of the EST file.)\n"; #print "Title: "; $pub_cit = &get_input("Title: "); #get this for est file $pub_cit.= "\n"; #replace newline removed in get_input if ($pub_cit =~ /\w+/) { #check pub cit has someting in it. $flag =0; } else { print "You must enter something!\n"; } } $ob_info2 .= $pub_cit; $flag = 1; while ($flag) { #print "Who are the authors?\n"; my $input .= &get_input("Who are the authors?\n"); $input .= "\n"; if ($input =~ /\w+/) { $flag = 0; $ob_info3 .= $input; } else { print "You must enter something!\n"; } } $flag = 1; while ($flag) { #print "Year of publication?\n"; my $input .= &get_input("Year of publication?\n"); $input .= "\n"; if ($input =~ /^[0-9]{4}$/) { #only 4 digits $flag = 0; $ob_info4 .= $input; } else { print "You must enter a year in the valid format, e.g. 2004!\n"; } } if ($status == 2 ||$status== 3 ||$status== 4) { #only ask questions that are relevent given the pub status #print "What is the publication journal?\n"; $nob_info1 = "JOURNAL: " . &get_input("What is the publication journal?\n") . "\n"; } if ($status== 4) { #print "What is the volume number? "; $nob_info2 = "VOLUME: " . &get_input("What is the volume number? ") . "\n"; #print "What is the issue number? "; $nob_info3 = "ISSUE: " . &get_input("What is the issue number? ") . "\n"; #print "What is are the page numbers? (e.g. 123-9): "; $nob_info4 = "PAGES: " . &get_input("What's the page numbers? ") . "\n"; #print "Medline unique identifer (if known): "; $nob_info5 = "MEDUID: " .&get_input("Medline unique identifer (if known): ") . "\n"; } #assemble pub file and display. if ok then proceed, else redo! $nPub_file_data [1] = $ob_info2; #add title $nPub_file_data [2] = $ob_info3; #add authors if ($status == 1) { $nPub_file_data [3] = $ob_info4; #add year $nPub_file_data [4] = $ob_info1; #add status } elsif ($status == 2 || $status ==3) { $nPub_file_data [3] = $nob_info1; #add journal $nPub_file_data [4] = $ob_info4; #add year $nPub_file_data [5] = $ob_info1; #add status } elsif ($status == 4) { $nPub_file_data [3] = $nob_info1; #add journal $nPub_file_data [4] = $nob_info2; #add volume $nPub_file_data [5] = $nob_info3; #add issue $nPub_file_data [6] = $nob_info4; #add pages $nPub_file_data [7] = $ob_info4; #add year $nPub_file_data [8] = $ob_info1; #add status } #separate section to add meduid, if present #if ($nob_info5 =~ /MEDUID:\s\w+/) { #true if an identifier entered if ($nob_info5) { unshift @nPub_file_data, $nob_info5; #add nob_info5 to start of the array } print "\n\tPlease check your new Pub file:\n @nPub_file_data\n"; #print "\nAre you happy to continue? (Y/N): "; my $input = &get_input("\nAre you happy to continue? (Y/N): "); chomp $input; if ($input =~ /n/i) { print "Ok, please re-enter the details...\n"; }else { push @nPub_file_data, "||\n"; #add double pipe file separator to end of array &save_new_file(@nPub_file_data); $ok_flag = 0; } } return @nPub_file_data; } ################################################################################################ sub dummyPub_file_data() { #Pub file not needed so get 'title' info for est file my @dPub_file_data; #print "What is the title of the article in the Pub file you have submitted?\n"; my $input .= &get_input("What is the title of the article in the Pub file you have submitted?\n"); #chomp $input; $pub_cit = $input; #get this for est file $dPub_file_data [0] = "TYPE: Pub\n"; $dPub_file_data [1] = "TITLE:\n" .$input; push @dPub_file_data, "||\n"; #add double pipe file separator to end of array return @dPub_file_data; } ################################################################################################ sub newLib_file_data() { #now updated to include all library file fields my @nLib_file_data; my $ok_flag = 1; while ($ok_flag) { @nLib_file_data = ""; $nLib_file_data[0] = "TYPE: Lib\n"; print "\nInformation for new Library file\n"; print "Please answer the following questions about the Library used to generate the ESTs\n"; print "you wish to submit.\n"; my $input; #gets used for all inputs. sometimes use strict seems really stupid. or is it just me? my $flag =1; while ($flag) { #print "What is the name of the library?\n"; $input = &get_input("What is the name of the library?\n"); chomp $input; if ($input) { #check that something has been entered $flag = 0; } else { print "You must enter the library name\n"; } } $nLib_file_data[1] = "NAME: " . $input . "\n"; $flag =1; #reset flag for next question while ($flag) { #print "What is the scientific name of the organism from which the library was prepared?\n"; $input = &get_input("What is the scientific name of the organism from which the library was prepared?\n"); chomp $input; if ($input =~ /[A-Z]+\s[A-Z]+/i) { #rough match of a binomial name $flag=0; } else { print "You must enter the bionomial scientific name of the organism\n"; } } $nLib_file_data[2] = "ORGANISM: " . $input ."\n"; print "The following information is not compulsory but please enter as much as you can.\nIf you can't enter the requested information, press enter.\n"; #print "Organism strain: "; my $number = 3; #for place in array. could use push or something but this works as well. $input = &get_input("Organism strain: "); chomp $input; if ($input) { $nLib_file_data[$number]= "STRAIN: " .$input ."\n" ; $number++; } #print "Cultivar: "; $input = &get_input("Cultivar: "); chomp $input; if ($input) { $nLib_file_data[$number]= "CULTIVAR: " .$input ."\n" ; $number++; } #print "Sex: "; $input = &get_input("Sex: "); chomp $input; if ($input) { $nLib_file_data[$number]= "SEX: " .$input ."\n" ; $number++; } #print "Organ: "; $input = &get_input("Organ: "); chomp $input; if ($input) { $nLib_file_data[$number]= "ORGAN: " .$input ."\n" ; $number++; } #print "Tissue: "; $input = &get_input("Tissue: "); chomp $input; if ($input) { $nLib_file_data[$number]= "TISSUE: " .$input ."\n" ; $number++; } #print "Cell type: "; $input = &get_input("Cell Type: "); chomp $input; if ($input) { $nLib_file_data[$number]= "CELL_TYPE: " .$input ."\n" ; $number++; } #print "Cell line: "; $input = &get_input("Cell Line: "); chomp $input; if ($input) { $nLib_file_data[$number]= "CELL_LINE: " .$input ."\n" ; $number++; } #print "Stage: "; $input = &get_input("Stage: "); chomp $input; if ($input) { $nLib_file_data[$number]= "STAGE: " .$input ."\n" ; $number++; } #print "Host: "; $input = &get_input("Host: "); chomp $input; if ($input) { $nLib_file_data[$number]= "HOST: " .$input ."\n" ; $number++; } #print "Vector: "; $input = &get_input("Vector: "); chomp $input; if ($input) { $nLib_file_data[$number]= "VECTOR: " .$input ."\n" ; $number++; } #print "Vector type: "; $input = &get_input("Vector type: "); chomp $input; if ($input) { $nLib_file_data[$number]= "V_TYPE: " .$input ."\n" ; $number++; } #print "Restriction enzyme 1: "; $input = &get_input("Restriction enzyme 1: "); chomp $input; if ($input) { $nLib_file_data[$number]= "RE_1: " .$input ."\n" ; $number++; } #print "REstriction enzyme 2: "; $input = &get_input("Restriction enzyme 2: "); chomp $input; if ($input) { $nLib_file_data[$number]= "RE_2: " .$input ."\n" ; $number++; } #print "Library description: "; $input = &get_input("Library description: "); chomp $input; if ($input) { $nLib_file_data[$number]= "DESCR: \n" .$input ."\n" ; $number++; } print "\n\tPlease check your new Lib file:\n @nLib_file_data\n"; #print "\nAre you happy to continue? (Y/N): "; $input = &get_input("\nAre you happy to continue? (Y/N): "); chomp $input; if ($input =~ /n/i) { print "Ok, please re-enter the details...\n"; }else { push @nLib_file_data, "||\n"; #add double pipe file separator to end of array &save_new_file(@nLib_file_data); $ok_flag = 0; } } return @nLib_file_data; } ################################################################################################ sub dummyLib_file_data() { #Lib file not needed so get 'name' info for est file my @dLib_file_data; my $flag=1; my $input; while ($flag) { #print "What is the name of the library?\n"; $input .= &get_input("What is the name of the library?\n"); chomp $input; if ($input) { $flag=0; } else { print "You must enter the library name\n"; } } #print "From what organism are the ESTs derived (e.g. Lumbricus rubellus)?\n"; $species = &get_input("From what organism are the ESTs derived (e.g. Lumbricus rubellus)?\n"); chomp $species; $flag = 1; while ($flag) { if ($species !~ /[A-Z]*\s[A-Z]/i) { #print "Please enter the binomial name for the species\n"; $species = &get_input("Please enter the binomial name for the species\n"); chomp $species; } else { $flag =0; } } $dLib_file_data [0] = "TYPE: Lib\n"; $dLib_file_data [1] = "NAME: " .$input; push @dLib_file_data, "||\n"; #add double pipe file separator to end of array return @dLib_file_data; } ################################################################################################ sub newCont_file_data() { my @nCont_file_data; my $ok_flag = 1; while ($ok_flag) { @nCont_file_data = ""; $nCont_file_data[0] = "TYPE: Cont\n"; print "\nInformation for new Contact file\n"; print "Please answer the following questions about the contact person for the ESTs\n"; print "you wish to submit.\n"; my $input; #gets used for all inputs. my $flag =1; while ($flag) { #print "What is the name of the person submitting the ESTs?\n"; $input = &get_input("What is the name of the person submitting the ESTs?\n"); chomp $input; if ($input) { #check that something has been entered $flag = 0; } else { print "You must enter a contact name\n"; } } $nCont_file_data[1] = "NAME: " . $input . "\n"; print "The following information is not compulsory but please enter as much as you can.\nIf you can't enter the requested information, press enter.\n"; #print "FAX number: "; my $number = 2; $input = &get_input("FAX number: "); chomp $input; if ($input) { $nCont_file_data[$number]= "FAX: " .$input ."\n" ; $number++; } #print "Telephone number: "; $input = &get_input("Telephone number: "); chomp $input; if ($input) { $nCont_file_data[$number]= "TEL: " .$input ."\n" ; $number++; } #print "Email: "; $input = &get_input("Email: "); chomp $input; if ($input) { $nCont_file_data[$number]= "EMAIL: " .$input ."\n" ; $number++; } #print "Laboratory providing EST: "; $input = &get_input("Laboratory providing EST: "); chomp $input; if ($input) { $nCont_file_data[$number]= "LAB: " .$input ."\n" ; $number++; } #print "Institution name: "; $input = &get_input("Institution name: "); chomp $input; if ($input) { $nCont_file_data[$number]= "INST: " .$input ."\n" ; $number++; } #print "Address (comma delineate): "; $input = &get_input("Address (comma delineate): "); chomp $input; if ($input) { $nCont_file_data[$number]= "ADDR: " .$input ."\n" ; $number++; } print "\n\tPlease check your new Cont file:\n @nCont_file_data\n"; #print "\nAre you happy to continue? (Y/N): "; $input = &get_input("\nAre you happy to continue? (Y/N): "); chomp $input; if ($input =~ /n/i) { print "Ok, please re-enter the details...\n"; }else { push @nCont_file_data, "||\n"; #add double pipe file separator to end of array &save_new_file(@nCont_file_data); $ok_flag = 0; } } return @nCont_file_data; } ################################################################################################ sub dummyCont_file_data() {#Cont file not needed so get 'cont name' info for est file my @dCont_file_data; my $flag=1; my $input; while ($flag) { #print "What is the name of the person submitting the ESTs?\n"; $input .= &get_input("What is the name of the person submitting the ESTs?\n"); chomp $input; if ($input) { $flag =0; } else { print "You must enter a name\n"; } } $dCont_file_data [0] = "TYPE: Cont\n"; $dCont_file_data [1] = "NAME: " .$input; push @dCont_file_data, "||\n"; #add double pipe file separator to end of array return @dCont_file_data; } ################################################################################################ sub save_new_file() { #allows the new EST, Pub, Lib, or Cont file to be saved to the relevant .db file my @new_file = @_; my $file; #gets set to scalar rep one of the .db files #print "Would you like to save this file for future use?(y/n):"; my $input = &get_input("Would you like to save this file for future use?(y/n):"); chomp $input; if ($input =~ /y/i) { foreach my $line (@new_file) { if ($line =~ /TYPE:\s.*Pub/) { $file = $Pub_file; } elsif ($line =~ /TYPE:\s.*Lib/) { $file = $Lib_file; } elsif ($line =~ /TYPE:\s.*Cont/) { $file = $Cont_file; } elsif ($line =~ /TYPE:\s.*EST/) { $file = $EST_file; #print "Please enter a name for this file: "; #get name for this record so it can be identifed when recalling saved files my $id = &get_input("Please enter a name for this file: "); chomp $id; unshift @new_file, "RECORD_ID: $id\n"; last; } } open (DBFILE, ">>$file") or die "Could not open $file\n"; #use >> for appending to file print DBFILE @new_file; print "File saved to $file\n"; close DBFILE; } else { print "Ok, file will not be saved.\n"; } } ################################################################################################ sub EST_file_data () { #allows user to select saved file or enter new EST file info (via sub new_EST_file), both return an array with the file data. my $sub_flag = 1; print colored ("\nEST File\n", "bold"); print "\n\tEach sequence submitted to dbEST must have an associated EST file.\n\tPlease choose one of the following options for the creation of this file:\n\n"; while ($sub_flag) { $sub_flag = 0; #hope everything goes ok print "\t1 - Enter information for a new EST file now\n"; if (!-r $EST_file) { #this loop allows us to reset the location of the file while (!-r $EST_file) { #file is (!not)readable, e.g. .db files haven't stayed in installation dir of basedir/file.db #print "Warning: $EST_file was not found.\nPlease enter the location of the EST file: "; $EST_file = &get_input("Warning: $EST_file was not found.\nPlease enter the location of the EST file: "); chomp $EST_file; } } open (ESTFILE, "$EST_file") or die "Error: $EST_file could not be opened!\n"; my $num = 1; while (my $line =) { if ($line =~ /^RECORD_ID:\s(.*)/) { $num++; if ($num == 2) { #we only want to print this line once print "\n\tOr use a saved file...\n\n"; } print "\t$num - "; print $1; print "\n"; } } close ESTFILE; # print "\n\tDon't fancy one of those? Enter h for help or q to quit.\n"; my $flag1=0; while($flag1==0) { my $answer= &get_input("\n\tDon't fancy one of those? Enter h for help or q to quit.\n"); chomp $answer; if($answer=~ /h/i) { &print_help(); next; } #use less to show trace2dbest.doc if($answer=~ /q/i) { print "Ok, trace2dbest exiting...\nBye \n"; exit(); } #exit program if($answer =~/h/i || $answer =~/q/i ) { $flag1=1; next; } elsif ($answer =~ /^\d+$/) { #if it's only digits if ($answer > $num || $answer == 0) { #if answer is a number outside the right range... print "Please enter a number between 1 and $num, or h for help and q to quit:\n"; } elsif ($answer eq 1) { @ESTfile_data = &new_EST_file(); $flag1=1; } else { #one of the records from ESTfile.db has been selected my $type_id = "RECORD_ID: "; #set type id for get_file_data @ESTfile_data = &get_file_data($answer, $EST_file, $type_id); $flag1 = 1; } } else { #if answer is not digits... print "Please enter a number between 1 and $num, or h for help and q to quit:\n"; } } if (@ESTfile_data) { return @ESTfile_data; } else { $sub_flag = 1; #everything not ok, return to near start } } } ################################################################################################ sub new_EST_file() { #gets info from user to make EST.db file my @nEST_file_data; my $input; my $ok_flag = 1; while ($ok_flag) { my $flag = 1; while ($flag) { $flag=0; #print "\nWhat was the sequencing primer used? [enter the primer name, optionally followed by the\nsequence in brackets()].\nE.g. SAC(GGGAACAAAAGCTGGAG): "; $input = &get_input("\nWhat was the sequencing primer used? [enter the primer name, optionally followed by the\nsequence in brackets()].\nE.g. SAC(GGGAACAAAAGCTGGAG): "); chomp $input; unless ($input) { #simple check for some kind of input print "You must enter at least the name of the sequencing primer.\n"; $flag=1; } $nEST_file_data[0] = "SEQ_PRIMER: $input\n"; } #print "What was the forward PCR primer? "; $input = get_input("What was the forward PCR primer? "); chomp $input; $nEST_file_data[1] = "PCR_F: $input\n"; #print "What was the reverse PCR primer? "; $input = get_input("What was the reverse PCR primer? "); chomp $input; $nEST_file_data[2] = "PCR_B: $input\n"; $flag =1; while ($flag) { #print "Which end was sequenced (5'/3')? If neither, just hit return: "; $input = get_input("Which end was sequenced (5'/3')? If neither, just hit return: "); chomp $input; if ($input =~ /(5'|3')/ || !$input) { $flag = 0; } else { print "Please enter 5' or 3'\n"; } } $nEST_file_data[3] = "P_END: $input\n"; #get release date print "\nPlease enter the date you would like your data to be made public. This date\n"; print "will be inserted into the PUBLIC field of the submission file.\n"; #print "Enter the date in the format MM/DD/YYYY. For immediate release, just press enter.\n"; my $input = get_input("Enter the date in the format MM/DD/YYYY. For immediate release, just press enter.\n"); $input .= "\n"; if ($input) { while ($input !~ /(\d\d)\/(\d\d)\/(\d\d\d\d)\n/) { if ($input=~/^\s$/) { $input = ""; last; } else { print "Please re-enter the date in the format MM/DD/YYYY\n"; $input = get_input("Please re-enter the date in the format MM/DD/YYYY\n"); $input .= "\n"; } } } chomp $input; $nEST_file_data[4] = "PUBLIC: $input\n"; #get comment #print "Please enter a comment about the ESTs for inclusion in the \"COMMENT\" field of\nthe EST file.\n"; $input = get_input("Please enter a comment about the ESTs for inclusion in the \"COMMENT\" field of\nthe EST file.\n"); chomp $input; $nEST_file_data[5] = "COMMENT: $input\n"; #now check it's ok and save then return. print "\n\tPlease check the data you have entered:\n @nEST_file_data\n"; print "\nAre you happy to continue? (Y/N): "; $input = get_input("\nAre you happy to continue? (Y/N): "); chomp $input; if ($input =~ /n/i) { print "Ok, please re-enter the details...\n"; }else { unshift @nEST_file_data, "TYPE: EST\n"; #add file type to start of array push @nEST_file_data, "||\n"; #add double pipe file separator to end of array &save_new_file(@nEST_file_data); $ok_flag = 0; } } return @nEST_file_data; } ################################################################################################ sub Get_vectors () { #uses vector file path to read in (fasta format) file and print out vector names (fasta headers) my ($file) = @_; my $line; my @vectors; my $count; my $input; my $return; #the return value, 1 for vector not there (or no FASTA headers found), 0 if it is. open (VECTORFILE, "<$file") || die "$file could not be opened"; while ($line = ) { if ($line =~ /^>/) { push @vectors, $line; $count++; } } if ($count) { print "\nThe following $count vectors were found in $file\n\n"; print @vectors; #print "\nAre you satisfied that the relevant vector is here? (y/n): "; $input = &get_input("\nAre you satisfied that the relevant vector is here? (y/n): "); chomp $input; if ($input =~ /n/i) { print "Ok, please re-select a vector file...\n\n"; $return =1; } }else { print "ERROR: No FASTA headers were found in $file.\nPlease re-select a FASTA format file...\n\n"; $return =1; } return $return; } ################################################################################# sub split_screened { # converts a fasta file to individual # txt files my $file=shift(@_); my $dir=shift(@_); my $flag=0; my $in_seq= ""; my @heading; my $header; my $filename; open(FSAFILE, "$file") || die "Can't open $file\n";; #catenated cross_match output file while(my $line=) { if($flag==1 && $line!~/^>/) { chop $line; $in_seq.=$line; } if($flag==1 && $line=~/^>/) { # print to file $flag=0; open(OUTFILE, ">$dir/$filename.seqsc") || die "Can't open $dir/$filename.seqsc\n"; print OUTFILE "$header\n"; my $n=0; while ($in_seq=~/(.)/g) { print OUTFILE "$1"; $n++; if(($n % 60) == 0) { print OUTFILE "\n"; } } if(($n % 60) != 0) { print OUTFILE "\n"; } $in_seq=""; close OUTFILE; } if($flag==0 && substr($line,0,1) eq ">" ) { #an unusual regex. chop $line; $flag=1; $header=$line; @heading = split(' ',$line); $heading[0] =~ s/>//; $filename = $heading[0]; next; } } #print last record to file open(OUTFILE, ">>$dir/$filename.seqsc") || die "Can't open $filename.txt\n"; print OUTFILE "$header\n"; my $n=0; while ($in_seq=~/(.)/g) { print OUTFILE "$1"; $n++; if(($n % 60) == 0) { print OUTFILE "\n"; } } if(($n % 60) != 0) { print OUTFILE "\n"; } close OUTFILE; } ################################################################################################ sub put_id { #runs BLAST (either local or remote) and gets id line of top hit my $blast_type = shift (@_); my $blast_save = shift (@_); my $sequence = shift (@_); my $file = shift (@_); my $blast_db = shift (@_); my $bits_cutoff = shift (@_); #sounds painful. my $prot_id = ""; my $flag = 0; my $evalue = "0"; my $scores; my $bits; #ooh er open(TEMPFILE,">temp.seq"); print TEMPFILE ">Query\n$sequence\n"; close TEMPFILE; while ($flag < 5) { ## Attempt blast 5 times if ($blast_type == 1) { system ("blastcl3 -p blastx -d nr -i temp.seq -o blast.out -T F"); my $aflag = 0; while ($aflag == 0) { wait(); open(FH,"blast.out"); if( -s FH > 550) { $aflag=1; close(FH); next; } #-s =non-zero file test operator close(FH); } } else { system("blastall -p blastx -d $blast_db -i temp.seq -o blast.out -T F"); } open(FH,"blast.out"); if(-s FH > 550) { $flag=10; close(FH); next; } close(FH); $flag++; } if($flag==10){ open(BLASTFILE,">blast_reports/blasts_full"); print BLASTSAVE "\n==========================================================================\n"; print BLASTSAVE "\n$file BLAST report\n"; } $flag=0; while (my $line=) { if ($blast_save ==1) { print BLASTSAVE "$line"; } if($line=~/^>(.*)/ && $flag==0) { $prot_id=$1; $flag=1; next; } if($line =~ /Expect\s=\s(\w.*)/ && $flag==1) { chomp $line; $scores = $line; $scores =~ /Score\s+=\s+(\d+)/; $bits = $1; #print "bit score $bits\n"; if ($bits >= $bits_cutoff) { $evalue=1; #use as flag to say that hit was found } last; } } if ($evalue ==0) { #catch no hit situation $prot_id = ""; } else { $prot_id =~ s/>.*//; #remove > from hit description $prot_id =~ s/\[(.*)\]/ \- $1/; #remove square brackets from descr $prot_id .= ". $scores"; #add score and e-value open (BLASTHIT, ">>blast_reports/blasts_tophit"); print BLASTHIT ("$file $prot_id\n"); } } else { print LOG "$file blast failed\n"; } unlink("blast.out"); #removes blast.out close BLASTFILE; close BLASTHIT; return $prot_id; } ############################################################################### sub sendEST (){ #send the submission file by email to dbEST use Mail::Mailer; my ($dbESTsend) = @_; my $flag = 1; #flag set to 0 if mail returns error my $recipient = "batch-sub\@ncbi.nlm.nih.gov"; #my $recipient = "al.anthony\@ed.ac.uk"; print "The EST submission file will now be sent by e-mail to\n"; print "$recipient\n"; #print "Please enter your e-mail address for replies:\n"; my $replyto = &get_input("Please enter your e-mail address for replies:\n"); chomp $replyto; my $subject = "EST submission"; my $body = "\n$dbESTsend"; #print "Please enter your SMTP server for outgoing mail (your\nlocal computer support will be able to tell you what this is):\n"; my $SMTP_SERVER = &get_input("Please enter your SMTP server for outgoing mail (your\nlocal computer support will be able to tell you what this is):\n"); chomp $SMTP_SERVER; #my $mailprog = "sendmail"; print "Sending to $recipient...\n"; #open (MAIL, "|$mailprog -t"); #print MAIL "To: $recipient\n"; #print MAIL "Reply-to: $replyto\n"; #print MAIL "Subject: EST submission\n"; #print MAIL "\n$dbESTsend"; #close (MAIL); eval { my $mailer = Mail::Mailer->new("smtp", Server => $SMTP_SERVER); $mailer -> open ({ From => "", To => $recipient, Cc => $replyto, 'Reply-to' => $replyto, Subject => $subject }); print $mailer $body; $mailer ->close; }; if ($@) { #this variable will contain any error messages returned print colored ("\nWARNING! - There was a problem sending this email.\n", "green bold"); print "see logfile for details. trace2dbest continuing...\n\n"; open (LOG, ">>logfile"); print LOG "MAIL WARNING!\n$@\nsmtp server was $SMTP_SERVER\nrecipeint was $recipient\nreply to was $replyto\n"; close LOG; $flag = 0; } #put smtp server and reply to in array and return it for use in EGTDC mail my @info = ($flag, $replyto, $SMTP_SERVER); return @info; } #################################################################################### sub sendESTegdc (){ #send brief details of submission to EGTDC my $replyto = shift @_; my $num_est = shift @_; my $SMTP_SERVER = shift @_; my $flag = 1; #flag set to 0 if mail returns error my $recipient = "EGTDC\@ceh.ac.uk"; #my $recipient = "al.anthony\@ed.ac.uk"; print "Information about your dbEST submission will now be sent by\n"; print "email to $recipient\n"; #print "Please enter a contact name: \n"; my $input = &get_input("Please enter a contact name: \n"); chomp $input; my $cont_name = $input; #print "...and, for EGTDC records, the source organism for the ESTs: \n"; my $input1 = &get_input("...and, for EGTDC records, the source organism for the ESTs: \n"); chomp $input; my $org_name = $input1; my $IP_add = `/sbin/ifconfig`; #get IP address of machine my $ip; $IP_add =~ /inet addr:(\d+\.\d+\.\d+\.\d+)/; $ip=$1; my $date = `date -I`; #get date my $EGDC_text = "Notification of E-mail submission to dbEST.\n\nContact name: $cont_name\nIP address: $ip\nOrganism name: $org_name\nDate of submission: $date\nNumber of ESTs submitted: $num_est\n"; my $subject = "trace2dbest EST submission"; my $body = "\n$EGDC_text"; eval { my $mailer = Mail::Mailer->new("smtp", Server => $SMTP_SERVER); $mailer -> open ({ From => "", To => $recipient, Cc => $replyto, 'Reply-to' => $replyto, Subject => $subject }); print $mailer $body; $mailer ->close; }; if ($@) { #this variable will contain any error messages returned print colored ("\nWARNING - There was a problem sending this email.\n", "green bold"); print "see logfile for details. trace2dbest continuing...\n\n"; open(LOG, ">>logfile"); print LOG "MAIL WRNING!\n$@\nsmtp server was $SMTP_SERVER\nrecipeint was $recipient\nreply to was $replyto\n"; close LOG; $flag = 0; } #my $mailprog = "sendmail"; #print "Sending to $recipient...\n"; #open (MAIL, "|$mailprog -t"); #print MAIL "To: $recipient\n"; #print MAIL "Reply-to: $replyto\n"; #print MAIL "Subject: EST submission\n"; #print MAIL "\n$EGDC_text"; #close (MAIL); return $flag; } ################################################################################ sub save_files (){ #saves all the output files in the users home dir or the standard common location. my ($sent) = shift @_; my ($place) = shift @_; my $location; my @traces; my $date= `date --iso-8601=minutes`; #get date and time to use as a unique identifier for file $date =~ /(.+)\+/ ; my $x = $1; #$x stores date in format: 2003-07-03T11:52 for example if ($sent) { #add tag to indicate if file was submitted to dbEST or not $x .= "submitted"; } else { $x .= "unsubmitted"; } if ($place) { $location = "/home/db/est_solutions/$species/trace2dbest/$x"; } else { $location = "$homedir/est_solutions/$species/trace2dbest/$x"; } mkdir "$location", 0777 or die "Can't make directory $location/raw_traces\n$!\n"; my @files = ("dbEST_submission.txt", "blast_reports/","fasta/","fastafiles/","partigene/","phd_dir/","process/","qual/","scf/","subfiles/"); foreach my $file (@files) { rename "$file", "$location/$file" or die "There was an error moving $file. Check logfile for details\n", print LOG "rename $file to $location$file failed with $!\n"; } #put relevant files in partigene dir my @quals = glob ("$location/qual/*.qual"); foreach my $qual (@quals) { copy ("$qual" , "$location/partigene") or die "Can't copy $qual to $location/partigene\n", print LOG "$qual to $location/partigene:$!\n"; } my @seqs = glob ("$location/fasta/*.seq"); foreach my $seq (@seqs) { copy ("$seq" , "$location/partigene") or die "Can't copy $seq/* to $location/partigene\n", print LOG "$seq/* to $location/partigene:$!\n"; } mkdir "$location/raw_traces", 0777 or die "Can't make directory $location/raw_traces\n$!\n"; @traces = glob ("$tracedir/*"); foreach my $file (@traces) { copy ($file, "$location/raw_traces")or die "Can't copy $tracedir/* to $location/raw_traces\n", print LOG " $tracedir/* to $location/raw_traces:$!\n"; } close LOG; rename "logfile", "$location/logfile" or die "Could not move logfile!\n$!"; print "\n\nThe EST submission file and the other output directories have been\nsaved in the following location:\n"; print "$location\n\n"; print colored ("Trace2dbest has finished and will now exit. Bye.\n\n", "bold"); } ######################################################################################################################## sub print_help() { #prints help document using less system("less $basedir/trace2dbest2.0_UG.txt"); } ########################################################################################################################## sub query_for_exit() { #exits program if 'n' entered my $input=''; while($input!~/y|n/i) { #print "\b"; $input= &get_input("\b"); chomp $input; } if($input=~/^n/i) { print "Bye \n"; exit(); } } ########################################################################################################################## sub get_input() { #uses either readline module or the traditional way to get user response to question printed to screen my $input; my $question = shift (@_); if ($read_gnu) { #true if gnu readline module installed $input = $term->readline("$question"); #print question and get user input } else { #do it the old way, without readline print "$question"; $input =<>; } chomp $input; #remove trailing newline $input =~ s/\s*$//; #remove trailing space return $input; } #############################################################################the end#########################################