Again, you outperformed! We still have to check your primers but you worked as scientist should, with care and analyzing alternative hypotesis rather than launching a couple of programs and taking note of the output. Good work!
Some of you asked for the script. Well, you deserve something more: an updated script 🙂
Did you notice the bottleneck of the script? It takes a while to load the sequence of the two contigs parsing a long file. So I decided to split the 454contigs.fna file producing 1 file for each contig. Thus knowing the name of the contig, you can directly open its file (eg. contig00323.fna).
The script (after the breack) was accordingly modified…
#!/usr/bin/perl # ==CONSTANTS== # we write here the FILENAME of the multifasta # file containing the contigs # paths: $multifasta = '/home/geno/tools/contigs.fna'; $contigs = '/home/geno/tools/contigs'; $primer3prog= '/home/geno/tools/primer3/src/primer3_core'; $blastprog = '/home/geno/tools/blast/bin/blastall'; # Size of target region for primer3 $targetsize = 50; # Maximum contig size (we trim the contig if larger than this) $maxcontigsize = 2500; # Defaults: $desiredproductsize = "300-600"; $evalmax = 0.5; #minimum e-value when blasting an oligo $deltalen = 2; #difference between oligo length and aspecific mismatch $optTM = 60; $optPLEN= 20; # print a MESSAGE for the user listing EXPECTED PARAMETERS print STDERR q( +----------------------------------------------------+ | __ __ __ __ __ __ __ __ | | --/\- P I C K --\ /--X--\ /--/\-- | | -/ \--\ /--/ \-P R I M E R S --/ \-- | | \__X__/ \__\/__/ \__X__/ \-- | +----------------------------------------------------+ | Parameters: | | contig1 dir1 contig2 dir2 [tm] [prLen] [pcrSize] | | | | E.g.: contig03291 C contig09482 U | +----------------------------------------------------+ ); # We populate variables with USER PARAMETERS ($contig1, $dir1, $contig2, $dir2, $userTM, $userPLEN, $userPCRSIZE) = @ARGV; if ($userTM) { $optTM = $userTM; } if ($userPLEN) { $optPLEN= $userPLEN;} if ($userPCRSIZE) {$desiredproductsize=$userPCRSIZE;} $minTM=$optTM-1.5; $maxTM=$optTM+1.5; $maxPLEN = $optPLEN+1; $minPLEN = $optPLEN-1; unless (-e $primer3prog) { die " FATAL ERROR:\n Unable to locate Primer3: $primer3prog.\n"; } unless (-e $blastprog) { die " FATAL ERROR:\n Unable to locate BLAST: $blastprog.\n"; } # Last argument is empty? Then let's ask the user to set 4 parameters if ($dir2 eq '') { die " FATAL ERROR:\n This program requires 4 parameters.\n"; } open (O, ">log-$contig1$dir1-$contig2$dir2.txt") || die " FATAL ERROR: Unable to write log file \"log-$contig1$dir1-$contig2$dir2.txt\".\n"; print O "contig1\t$contig1$dir1\ncontig2\t$contig2$dir2\n"; # So... these are the user supplied parameters: print STDERR " INPUT DATA: contig1 $contig1\t$dir1 contig2 $contig2\t$dir2 Annealing $optTM\°C [$minTM\°C - $maxTM\°C] Primer $optPLEN [$minPLEN - $maxPLEN] Product $desiredproductsize bp "; $seq1 = loadsequence($contig1); $seq2 = loadsequence($contig2); # Control: did we populate both $seq1 and $seq2? # Suppose that the user mistyped one or both contig names! if (length($seq1) == 0) { $notfound.= "FATAL ERROR:\n\"$contig1\" was not found!\n"; } if (length($seq2) == 0) { $notfound.= "FATAL ERROR:\n\"$contig2\" was not found!\n"; } if ($notfound) { die $notfound; } # Calculate rev-compl (using a subroutine) if ($dir1 eq 'C') { $seq1 = rc($seq1); } if ($dir2 eq 'C') { $seq2 = rc($seq2); } # Let's trim contigs if they are larger than $maxcontigsize. This will reduce # primer3 execution time. if (length($seq1)>$maxcontigsize) { print STDERR "$contig1 was reduced to $maxcontigsize bp to accelerate primer picking.\n"; $startfrom = length($seq1)-$maxcontigsize; $seq1 = substr($seq1, $startfrom, $maxcontigsize); } if (length($seq2)>$maxcontigsize) { print STDERR "$contig2 was reduced to $maxcontigsize bp to accelerate primer picking.\n"; $seq2 = substr($seq2, 0, $maxcontigsize); } $template = $seq1.'nn'.$seq2; $start_point = length($seq1)-int($targetsize/2); $primer3conf = "SEQUENCE_ID=GAP-$contig1$dir1-$contig2$dir2 PRIMER_TM_FORMULA=1 PRIMER_SALT_CORRECTIONS=1 SEQUENCE_TEMPLATE=$template SEQUENCE_TARGET=$start_point,$targetsize PRIMER_MIN_TM=$minTM PRIMER_MAX_TM=$maxTM PRIMER_OPT_TM=$optTM PRIMER_TASK=pick_detection_primers PRIMER_PICK_LEFT_PRIMER=1 PRIMER_PICK_INTERNAL_OLIGO=0 PRIMER_PICK_RIGHT_PRIMER=1 PRIMER_MIN_SIZE=$minPLEN PRIMER_OPT_SIZE=$optPLEN PRIMER_MAX_SIZE=$maxPLEN PRIMER_MAX_NS_ACCEPTED=0 PRIMER_PRODUCT_SIZE_RANGE=$desiredproductsize P3_FILE_FLAG=0 PRIMER_EXPLAIN_FLAG=0 ="; # Launch primer3 $command = "echo \"$primer3conf\" | $primer3prog"; print STDERR "Launching Primer3... "; @primer3output = qx($command); print STDERR "Done.\n"; print O ">Template-seq\n$template\n\nPRIMER3INPUT:\n$primer3conf\n\nPRIMER3OUTPUT:\n@primer3output\n"; print STDERR "Parsing Primer3 output...\n"; # We save relevant information from Primer3 output to the %primer hash foreach $line (@primer3output) { #print "$line"; chomp($line); ($name, $value) = split(/=/, $line); if ($name=~/PRIMER_(PAIR|LEFT|RIGHT)_(\d+)_(\w+)$/) { $primers{$name}=$value; $lastprimerid = $2 if ($2 > $lastprimerid); } } # Analyze each primer pair for ($primerid = 0; $primerid <= $lastprimerid; $primerid++) { $pcrprod = $primers{"PRIMER_PAIR_$primerid\_PRODUCT_SIZE"}; print "\n#Primer pair $primerid: $pcrprod bp\n"; print O "\n#Primer pair $primerid: $pcrprod bp\n"; $seqfor = $primers{"PRIMER_LEFT_$primerid\_SEQUENCE"}; $seqrev = $primers{"PRIMER_RIGHT_$primerid\_SEQUENCE"}; $tmfor = $primers{"PRIMER_LEFT_$primerid\_TM"}; $tmrev = $primers{"PRIMER_RIGHT_$primerid\_TM"}; ($hitsF, $longF, $maxF, $maxcontigF, $selfF) = blast($seqfor); ($hitsR, $longR, $maxR, $maxcontigR, $selfR) = blast($seqrev); print O ">Primer$primerid-For $tmfor [longHits=$hitsF, topHits=$longF, topHit=$maxF,$maxcontigF ($selfF)]\n$seqfor >Primer$primerid-Rev $tmrev [longHits=$hitsR, topHits=$longR, topHit=$maxR,$maxcontigR ($selfR)]\n$seqrev\n"; print ">Primer$primerid-For $tmfor [longHits=$hitsF, topHits=$longF, topHit=$maxF,$maxcontigF ($selfF)]\n$seqfor >Primer$primerid-Rev $tmrev [longHits=$hitsR, topHits=$longR, topHit=$maxR,$maxcontigR ($selfR)]\n$seqrev\n"; } # SUB ROUTINES sub blast { # Get input (sequence) and put in $s my $s = $_[0]; my $max = 0; my $maxcontig =''; my $hits= 0; my $topHits='n/a'; my %counter =(); # Run blast my @blastoutput = qx(echo $s | $blastprog -p blastn -d $multifasta -m 8 -e $evalmax); #print "|BLAST LAUNCH| >$name .. $value:$len0\n"; $len0 = length($s); foreach $line (@blastoutput) { ($query, $seqname, $percid, $len) = split(/\t/, $line); # if seqname is one of the two contigs... if ($seqname eq $contig1 or $seqname eq $contig2) { $self = "$len/$len0 $seqname"; } else { # Else we check if the alignment is dangerous (=long) $counter{$len}++; if ($len>$max) { $max = $len; $maxcontig = $seqname; } if ($len > $len0-$deltalen) { $hits++; } } #print "|BLAST OUTPUT| $ln" if ($len> $len0-2); } #print ">>BLAST $best ($max, $maxcontig)\n"; if ($counter{$max}>0) { $topHits=$counter{$max}; } return ($hits, $counter{$max}, $max, $maxcontig, $self); } sub rc { # This subroutine calculates reverse-complementary $sequence = $_[0]; $sequence = reverse($sequence); $sequence =~tr/ACGTacgt/TGCAtgca/; return $sequence } sub loadsequence { my $sequence = ''; my $contigname = $_[0]; open (CONTIG, "$contigs/$contigname.fna") || die " FATAL ERROR:\n Cant open $contigs/$contigname.fna\n"; while (<CONTIG>) { chomp; if ($_=~/>/) { # nothing to do: its the header... } else { $sequence.=$_; } } return $sequence; } sub happy_ending { $kermit = q( _._ _.-*""*-._ _._ .' `*. .*' `. . .@*" ; : "*@. . ` ' .' `. ` ' /`..-' `-..'\ . . .*" "*. ' '\`*-._ _.-*'/` ` : ` \ :`*----*'; / ' ; . \ `-.__.-' / . `. `. .' .' `. `. .' .' `-. `*--*' .-' `*-.__.-*' ); print $kermit; } |