Good work guys! (final script solution)

Again, you outperformed! We still have to check your primers but you worked as scientist should, with care and analyzing alternative hypotesis rather than launching a couple of programs and taking note of the output. Good work!

Some of you asked for the script. Well, you deserve something more: an updated script 🙂
Did you notice the bottleneck of the script? It takes a while to load the sequence of the two contigs parsing a long file. So I decided to split the 454contigs.fna file producing 1 file for each contig. Thus knowing the name of the contig, you can directly open its file (eg. contig00323.fna).

The script (after the breack) was accordingly modified…

#!/usr/bin/perl
 
# ==CONSTANTS==
# we write here the FILENAME of the multifasta 
# file containing the contigs
 
# paths:
$multifasta = '/home/geno/tools/contigs.fna';
$contigs = '/home/geno/tools/contigs';
$primer3prog= '/home/geno/tools/primer3/src/primer3_core';
$blastprog  = '/home/geno/tools/blast/bin/blastall';
# Size of target region for primer3
$targetsize = 50;  
# Maximum contig size (we trim the contig if larger than this)
$maxcontigsize = 2500;
# Defaults:
$desiredproductsize = "300-600";
$evalmax  = 0.5;  #minimum e-value when blasting an oligo
$deltalen = 2;    #difference between oligo length and aspecific mismatch 
$optTM = 60;
$optPLEN= 20;
#  print a MESSAGE for the user listing EXPECTED PARAMETERS
print STDERR q(
 +----------------------------------------------------+
 |     __  __     __ __      __ __      __  __        |
 |     --/\- P I C K --\    /--X--\    /--/\--        |
 |     -/  \--\ /--/ \-P R I M E R S   --/  \--       |
 |          \__X__/   \__\/__/    \__X__/    \--      |
 +----------------------------------------------------+
 |   Parameters:                                      |
 |   contig1 dir1 contig2 dir2 [tm] [prLen] [pcrSize] |
 |                                                    |
 |   E.g.: contig03291 C contig09482 U                |
 +----------------------------------------------------+
);
# We populate variables with USER PARAMETERS
($contig1, $dir1, $contig2, $dir2, $userTM, $userPLEN, $userPCRSIZE) = @ARGV;
 
if ($userTM)   { $optTM = $userTM; }
if ($userPLEN) { $optPLEN= $userPLEN;}
if ($userPCRSIZE) {$desiredproductsize=$userPCRSIZE;}
 
$minTM=$optTM-1.5;
$maxTM=$optTM+1.5;
$maxPLEN = $optPLEN+1;
$minPLEN = $optPLEN-1;
 
 
unless (-e $primer3prog) {
	die " FATAL ERROR:\n Unable to locate Primer3: $primer3prog.\n";
}
unless (-e $blastprog) {
	die " FATAL ERROR:\n Unable to locate BLAST: $blastprog.\n";
}
# Last argument is empty? Then let's ask the user to set 4 parameters
if ($dir2 eq '') {
	die " FATAL ERROR:\n This program requires 4 parameters.\n";
}
 
open (O, ">log-$contig1$dir1-$contig2$dir2.txt") || die " FATAL ERROR:
 Unable to write log file \"log-$contig1$dir1-$contig2$dir2.txt\".\n";
print O "contig1\t$contig1$dir1\ncontig2\t$contig2$dir2\n";
 
# So... these are the user supplied parameters:
print STDERR "
INPUT DATA:
contig1   $contig1\t$dir1
contig2   $contig2\t$dir2
Annealing $optTM\°C [$minTM\°C - $maxTM\°C]
Primer    $optPLEN [$minPLEN - $maxPLEN]
Product   $desiredproductsize bp
";
 
$seq1 = loadsequence($contig1);
$seq2 = loadsequence($contig2);
 
# Control: did we populate both $seq1 and $seq2?
# Suppose that the user mistyped one or both contig names!
 
if (length($seq1) == 0) {
	$notfound.= "FATAL ERROR:\n\"$contig1\" was not found!\n";
}
if (length($seq2) == 0) {
	$notfound.= "FATAL ERROR:\n\"$contig2\" was not found!\n";
}
 
if ($notfound) {
	die $notfound;
}
# Calculate rev-compl (using a subroutine)
if ($dir1 eq 'C') {
	$seq1 = rc($seq1);
}
if ($dir2 eq 'C') {
	$seq2 = rc($seq2);
}
 
# Let's trim contigs if they are larger than $maxcontigsize. This will reduce
# primer3 execution time.
if (length($seq1)>$maxcontigsize) {
	print STDERR "$contig1 was reduced to $maxcontigsize bp to accelerate primer picking.\n";
	$startfrom = length($seq1)-$maxcontigsize;
	$seq1 = substr($seq1, $startfrom, $maxcontigsize);
}
 
if (length($seq2)>$maxcontigsize) {
	print STDERR "$contig2 was reduced to $maxcontigsize bp to accelerate primer picking.\n";
	$seq2 = substr($seq2, 0, $maxcontigsize);
}
 
$template = $seq1.'nn'.$seq2;
 
$start_point = length($seq1)-int($targetsize/2);
 
$primer3conf = "SEQUENCE_ID=GAP-$contig1$dir1-$contig2$dir2
PRIMER_TM_FORMULA=1
PRIMER_SALT_CORRECTIONS=1
SEQUENCE_TEMPLATE=$template
SEQUENCE_TARGET=$start_point,$targetsize
PRIMER_MIN_TM=$minTM
PRIMER_MAX_TM=$maxTM
PRIMER_OPT_TM=$optTM
PRIMER_TASK=pick_detection_primers
PRIMER_PICK_LEFT_PRIMER=1
PRIMER_PICK_INTERNAL_OLIGO=0
PRIMER_PICK_RIGHT_PRIMER=1
PRIMER_MIN_SIZE=$minPLEN
PRIMER_OPT_SIZE=$optPLEN
PRIMER_MAX_SIZE=$maxPLEN
PRIMER_MAX_NS_ACCEPTED=0
PRIMER_PRODUCT_SIZE_RANGE=$desiredproductsize
P3_FILE_FLAG=0
PRIMER_EXPLAIN_FLAG=0
=";
 
# Launch primer3
$command = "echo \"$primer3conf\" | $primer3prog";
print STDERR "Launching Primer3... ";
@primer3output = qx($command);
print STDERR "Done.\n";
 
print O ">Template-seq\n$template\n\nPRIMER3INPUT:\n$primer3conf\n\nPRIMER3OUTPUT:\n@primer3output\n";
 
print STDERR "Parsing Primer3 output...\n";
# We save relevant information from Primer3 output to the %primer hash
foreach $line (@primer3output) {
	#print "$line";
	chomp($line);
	($name, $value) = split(/=/, $line);
	if ($name=~/PRIMER_(PAIR|LEFT|RIGHT)_(\d+)_(\w+)$/) {
		$primers{$name}=$value;
		$lastprimerid = $2 if ($2 > $lastprimerid);
	}
 
}
# Analyze each primer pair 
for ($primerid = 0; $primerid <= $lastprimerid; $primerid++) {
	$pcrprod = $primers{"PRIMER_PAIR_$primerid\_PRODUCT_SIZE"};
	print "\n#Primer pair $primerid: $pcrprod bp\n";
	print O "\n#Primer pair $primerid: $pcrprod bp\n";
	$seqfor = $primers{"PRIMER_LEFT_$primerid\_SEQUENCE"};
	$seqrev = $primers{"PRIMER_RIGHT_$primerid\_SEQUENCE"};
	$tmfor  = $primers{"PRIMER_LEFT_$primerid\_TM"};
	$tmrev  = $primers{"PRIMER_RIGHT_$primerid\_TM"};
 
	($hitsF, $longF, $maxF, $maxcontigF, $selfF) = blast($seqfor);
	($hitsR, $longR, $maxR, $maxcontigR, $selfR) = blast($seqrev);
	print O ">Primer$primerid-For $tmfor [longHits=$hitsF, topHits=$longF, topHit=$maxF,$maxcontigF ($selfF)]\n$seqfor
>Primer$primerid-Rev $tmrev [longHits=$hitsR, topHits=$longR, topHit=$maxR,$maxcontigR ($selfR)]\n$seqrev\n";
 
	print ">Primer$primerid-For $tmfor [longHits=$hitsF, topHits=$longF, topHit=$maxF,$maxcontigF ($selfF)]\n$seqfor
>Primer$primerid-Rev $tmrev [longHits=$hitsR, topHits=$longR, topHit=$maxR,$maxcontigR ($selfR)]\n$seqrev\n";
}
 
# SUB ROUTINES
 
sub blast {
		# Get input (sequence) and put in $s
		my $s = $_[0];
 
		my $max = 0;
		my $maxcontig ='';
		my $hits= 0;
		my $topHits='n/a';
		my %counter =();
		# Run blast
		my @blastoutput = qx(echo $s | $blastprog -p blastn -d $multifasta -m 8 -e $evalmax);
		#print "|BLAST LAUNCH| >$name .. $value:$len0\n";
		$len0 = length($s);
		foreach $line (@blastoutput) {
			($query, $seqname, $percid, $len) = split(/\t/, $line);
 
			# if seqname is one of the two contigs...
			if ($seqname eq $contig1 or $seqname eq $contig2) {
				$self = "$len/$len0 $seqname";
			} else {
			# Else we check if the alignment is dangerous (=long)
				$counter{$len}++;
				if ($len>$max) {
					$max = $len;
					$maxcontig = $seqname;
				}
				if ($len > $len0-$deltalen) {
					$hits++;
				}
			}
			#print "|BLAST OUTPUT| $ln" if ($len> $len0-2);
		}
		#print ">>BLAST $best ($max, $maxcontig)\n";
		if ($counter{$max}>0) {
			$topHits=$counter{$max};
		}
		return ($hits, $counter{$max}, $max, $maxcontig, $self);
}
 
sub rc {
	# This subroutine calculates reverse-complementary 
 
	$sequence = $_[0];
	$sequence = reverse($sequence);
	$sequence =~tr/ACGTacgt/TGCAtgca/;
	return $sequence
}
 
sub loadsequence {
	my $sequence = '';
	my $contigname = $_[0];
 
	open (CONTIG, "$contigs/$contigname.fna") || die " FATAL ERROR:\n Cant open $contigs/$contigname.fna\n";
 
	while (<CONTIG>) {
		chomp;
		if ($_=~/>/) {
			# nothing to do: its the header...
		} else {
			$sequence.=$_;
		}
	}
 
	return $sequence;
}
sub happy_ending {
	$kermit = q(
 
                    _._ _.-*""*-._ _._                                
                  .'   `*.      .*'   `.                              
                 .  .@*" ;      : "*@.  .                            
                 ` '   .'        `.   ` '                              
                 /`..-'            `-..'\                              
                .                        .                            
                   .*"              "*.                               
               '  '\`*-._        _.-*'/`  `                           
               :  ` \   :`*----*';   / '  ;                           
                .    \   `-.__.-'   /    .                            
                 `.   `.          .'   .'                             
                   `.   `.      .'   .'                               
                     `-.  `*--*'  .-'                                 
                        `*-.__.-*'                                    
 
	);
	print $kermit;
}

#!/usr/bin/perl # ==CONSTANTS== # we write here the FILENAME of the multifasta # file containing the contigs # paths: $multifasta = '/home/geno/tools/contigs.fna'; $contigs = '/home/geno/tools/contigs'; $primer3prog= '/home/geno/tools/primer3/src/primer3_core'; $blastprog = '/home/geno/tools/blast/bin/blastall'; # Size of target region for primer3 $targetsize = 50; # Maximum contig size (we trim the contig if larger than this) $maxcontigsize = 2500; # Defaults: $desiredproductsize = "300-600"; $evalmax = 0.5; #minimum e-value when blasting an oligo $deltalen = 2; #difference between oligo length and aspecific mismatch $optTM = 60; $optPLEN= 20; # print a MESSAGE for the user listing EXPECTED PARAMETERS print STDERR q( +----------------------------------------------------+ | __ __ __ __ __ __ __ __ | | --/\- P I C K --\ /--X--\ /--/\-- | | -/ \--\ /--/ \-P R I M E R S --/ \-- | | \__X__/ \__\/__/ \__X__/ \-- | +----------------------------------------------------+ | Parameters: | | contig1 dir1 contig2 dir2 [tm] [prLen] [pcrSize] | | | | E.g.: contig03291 C contig09482 U | +----------------------------------------------------+ ); # We populate variables with USER PARAMETERS ($contig1, $dir1, $contig2, $dir2, $userTM, $userPLEN, $userPCRSIZE) = @ARGV; if ($userTM) { $optTM = $userTM; } if ($userPLEN) { $optPLEN= $userPLEN;} if ($userPCRSIZE) {$desiredproductsize=$userPCRSIZE;} $minTM=$optTM-1.5; $maxTM=$optTM+1.5; $maxPLEN = $optPLEN+1; $minPLEN = $optPLEN-1; unless (-e $primer3prog) { die " FATAL ERROR:\n Unable to locate Primer3: $primer3prog.\n"; } unless (-e $blastprog) { die " FATAL ERROR:\n Unable to locate BLAST: $blastprog.\n"; } # Last argument is empty? Then let's ask the user to set 4 parameters if ($dir2 eq '') { die " FATAL ERROR:\n This program requires 4 parameters.\n"; } open (O, ">log-$contig1$dir1-$contig2$dir2.txt") || die " FATAL ERROR: Unable to write log file \"log-$contig1$dir1-$contig2$dir2.txt\".\n"; print O "contig1\t$contig1$dir1\ncontig2\t$contig2$dir2\n"; # So... these are the user supplied parameters: print STDERR " INPUT DATA: contig1 $contig1\t$dir1 contig2 $contig2\t$dir2 Annealing $optTM\°C [$minTM\°C - $maxTM\°C] Primer $optPLEN [$minPLEN - $maxPLEN] Product $desiredproductsize bp "; $seq1 = loadsequence($contig1); $seq2 = loadsequence($contig2); # Control: did we populate both $seq1 and $seq2? # Suppose that the user mistyped one or both contig names! if (length($seq1) == 0) { $notfound.= "FATAL ERROR:\n\"$contig1\" was not found!\n"; } if (length($seq2) == 0) { $notfound.= "FATAL ERROR:\n\"$contig2\" was not found!\n"; } if ($notfound) { die $notfound; } # Calculate rev-compl (using a subroutine) if ($dir1 eq 'C') { $seq1 = rc($seq1); } if ($dir2 eq 'C') { $seq2 = rc($seq2); } # Let's trim contigs if they are larger than $maxcontigsize. This will reduce # primer3 execution time. if (length($seq1)>$maxcontigsize) { print STDERR "$contig1 was reduced to $maxcontigsize bp to accelerate primer picking.\n"; $startfrom = length($seq1)-$maxcontigsize; $seq1 = substr($seq1, $startfrom, $maxcontigsize); } if (length($seq2)>$maxcontigsize) { print STDERR "$contig2 was reduced to $maxcontigsize bp to accelerate primer picking.\n"; $seq2 = substr($seq2, 0, $maxcontigsize); } $template = $seq1.'nn'.$seq2; $start_point = length($seq1)-int($targetsize/2); $primer3conf = "SEQUENCE_ID=GAP-$contig1$dir1-$contig2$dir2 PRIMER_TM_FORMULA=1 PRIMER_SALT_CORRECTIONS=1 SEQUENCE_TEMPLATE=$template SEQUENCE_TARGET=$start_point,$targetsize PRIMER_MIN_TM=$minTM PRIMER_MAX_TM=$maxTM PRIMER_OPT_TM=$optTM PRIMER_TASK=pick_detection_primers PRIMER_PICK_LEFT_PRIMER=1 PRIMER_PICK_INTERNAL_OLIGO=0 PRIMER_PICK_RIGHT_PRIMER=1 PRIMER_MIN_SIZE=$minPLEN PRIMER_OPT_SIZE=$optPLEN PRIMER_MAX_SIZE=$maxPLEN PRIMER_MAX_NS_ACCEPTED=0 PRIMER_PRODUCT_SIZE_RANGE=$desiredproductsize P3_FILE_FLAG=0 PRIMER_EXPLAIN_FLAG=0 ="; # Launch primer3 $command = "echo \"$primer3conf\" | $primer3prog"; print STDERR "Launching Primer3... "; @primer3output = qx($command); print STDERR "Done.\n"; print O ">Template-seq\n$template\n\nPRIMER3INPUT:\n$primer3conf\n\nPRIMER3OUTPUT:\n@primer3output\n"; print STDERR "Parsing Primer3 output...\n"; # We save relevant information from Primer3 output to the %primer hash foreach $line (@primer3output) { #print "$line"; chomp($line); ($name, $value) = split(/=/, $line); if ($name=~/PRIMER_(PAIR|LEFT|RIGHT)_(\d+)_(\w+)$/) { $primers{$name}=$value; $lastprimerid = $2 if ($2 > $lastprimerid); } } # Analyze each primer pair for ($primerid = 0; $primerid <= $lastprimerid; $primerid++) { $pcrprod = $primers{"PRIMER_PAIR_$primerid\_PRODUCT_SIZE"}; print "\n#Primer pair $primerid: $pcrprod bp\n"; print O "\n#Primer pair $primerid: $pcrprod bp\n"; $seqfor = $primers{"PRIMER_LEFT_$primerid\_SEQUENCE"}; $seqrev = $primers{"PRIMER_RIGHT_$primerid\_SEQUENCE"}; $tmfor = $primers{"PRIMER_LEFT_$primerid\_TM"}; $tmrev = $primers{"PRIMER_RIGHT_$primerid\_TM"}; ($hitsF, $longF, $maxF, $maxcontigF, $selfF) = blast($seqfor); ($hitsR, $longR, $maxR, $maxcontigR, $selfR) = blast($seqrev); print O ">Primer$primerid-For $tmfor [longHits=$hitsF, topHits=$longF, topHit=$maxF,$maxcontigF ($selfF)]\n$seqfor >Primer$primerid-Rev $tmrev [longHits=$hitsR, topHits=$longR, topHit=$maxR,$maxcontigR ($selfR)]\n$seqrev\n"; print ">Primer$primerid-For $tmfor [longHits=$hitsF, topHits=$longF, topHit=$maxF,$maxcontigF ($selfF)]\n$seqfor >Primer$primerid-Rev $tmrev [longHits=$hitsR, topHits=$longR, topHit=$maxR,$maxcontigR ($selfR)]\n$seqrev\n"; } # SUB ROUTINES sub blast { # Get input (sequence) and put in $s my $s = $_[0]; my $max = 0; my $maxcontig =''; my $hits= 0; my $topHits='n/a'; my %counter =(); # Run blast my @blastoutput = qx(echo $s | $blastprog -p blastn -d $multifasta -m 8 -e $evalmax); #print "|BLAST LAUNCH| >$name .. $value:$len0\n"; $len0 = length($s); foreach $line (@blastoutput) { ($query, $seqname, $percid, $len) = split(/\t/, $line); # if seqname is one of the two contigs... if ($seqname eq $contig1 or $seqname eq $contig2) { $self = "$len/$len0 $seqname"; } else { # Else we check if the alignment is dangerous (=long) $counter{$len}++; if ($len>$max) { $max = $len; $maxcontig = $seqname; } if ($len > $len0-$deltalen) { $hits++; } } #print "|BLAST OUTPUT| $ln" if ($len> $len0-2); } #print ">>BLAST $best ($max, $maxcontig)\n"; if ($counter{$max}>0) { $topHits=$counter{$max}; } return ($hits, $counter{$max}, $max, $maxcontig, $self); } sub rc { # This subroutine calculates reverse-complementary $sequence = $_[0]; $sequence = reverse($sequence); $sequence =~tr/ACGTacgt/TGCAtgca/; return $sequence } sub loadsequence { my $sequence = ''; my $contigname = $_[0]; open (CONTIG, "$contigs/$contigname.fna") || die " FATAL ERROR:\n Cant open $contigs/$contigname.fna\n"; while (<CONTIG>) { chomp; if ($_=~/>/) { # nothing to do: its the header... } else { $sequence.=$_; } } return $sequence; } sub happy_ending { $kermit = q( _._ _.-*""*-._ _._ .' `*. .*' `. . .@*" ; : "*@. . ` ' .' `. ` ' /`..-' `-..'\ . . .*" "*. ' '\`*-._ _.-*'/` ` : ` \ :`*----*'; / ' ; . \ `-.__.-' / . `. `. .' .' `. `. .' .' `-. `*--*' .-' `*-.__.-*' ); print $kermit; }

Perl & Genomics

Bioinformatics for Genomics Course – University of Padua

Good work guys! (final script solution)

Leave a Reply Cancel reply