Perlscr
From FreeBio
Contents |
Making variability map from copy genome difference file format
The perl script takes as input a file containing a number of genome sequences in difference format. It reads in the positions of mutations and orders them and writes them to a file
The perl script: varmap.pl
#!/usr/bin/perl -w
use Bio::Perl;
my $datafile = 'allcopyseq.rdat'; # file containing the position of mutations and there
type
my $varmapfile ='allcopyseq.rdat';
# read in strings
my $count = 1;
my @muts;
open(FILE, $datafile);
while (<FILE>) {
push @muts, [$_]; # thus each sequence of mutations is listed in the [n][0]
# @muts element of
$count++;
}
close(FILE);
## Split up positions of mutations into an array
my @seqpos;
my $k = 1;
while ($k<$count) {
my @seqmut;
@seqmut = split(/ /,$muts[$k][0]);
my $i=0;
$i = $#seqmut;
## Make an array that only contains the positions of the mutations
my @mutpos;
@mutpos=@seqmut;
$j = 0;
while ($j<$i){
my $sp;
$sp =$i - $j;
splice(@mutpos, $sp, 1);
$j=$j+2;
}
@seqpos = (@seqpos, @mutpos);
$k=$k+2;
}
my @varmap;
@varmap = sort { $a <=> $b } @seqpos;
print "@varmap\n";
open(DAT,">$varmapfile") || die("Cannot Open File");
print DAT "@varmap\n";
close(DAT);
Determining positions of deviations from reference genome from an clustalw .aln file
The script reads in a clustalw alignment file and creates a string that contains the results of the alignment (That is where the sequences agree and disagree). The positions of disagreement are found and are written to a file including the string of alignment results.
The perl script: readalign.pl
#!/usr/bin/perl -w
use Bio::Perl;
my $filename = 'xxx.aln'; # clustalw alignment input file
my $nseq = 2; #Number of sequences that have been aligned
my $align; # variable containing the alignment code only
my $rawalign = 'xxx.raln', # Output filename for raw alignment code
my $mut; # variable containing sites of mutations
##1 READ IN RAW ALIGNMENT SEQUENCE
# Open clustal alignment file
open(FILE, $filename); #Open file containing SARS genome
# Go through alignment file headers
<FILE>;
<FILE>;
<FILE>;
while(<FILE>) {
for ($count=1; $count<$nseq +1; $count++){
$_=<FILE>; # Do not read in the sequences from the alignment file
print "$_";
}
$_ =substr($_, 16); # remove blanks before the alignment code
chomp $_; # remove newline at the end of alignment code
$align = $align . $_; # Append to alignment string containing raw alingment code
close(FILE);
my $i = -0.01;
my $pos = 0;
while ($i>=-0.1) {
$i = index($align, " ", $pos);
if ($i >= 0) {
$pos = $i + 1;
$mut .= " $i,";
}
}
# Write raw alignment data and positions of mutations to file as specified by the variable $rawalign
open(DAT,">$rawalign") || die("Cannot Open File");
if ($mut) {
chop $mut;
print DAT "Mutation site(s) at index:$mut\n";
} else {
print DAT "No mutations found.\n";
}
print DAT "$align\n";
close(DAT);
Input file example
CLUSTAL W (1.83) multiple sequence alignment
U00096 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
U00296 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
U00196 AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
************************************************************
U00096 TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
U00296 TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
U00196 TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
************************************************************
U00096 TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
U00296 TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
U00196 TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
************************************************************
U00096 ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
U00296 ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
U00196 ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
************************************************************
U00096 AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
U00296 AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
U00196 AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
************************************************************
U00096 CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
U00296 CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
U00196 CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
************************************************************
U00096 ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
U00296 ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
U00196 ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
************************************************************
U00096 AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
U00296 AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
U00196 AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
************************************************************
U00096 GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
U00296 GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
U00196 GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
************************************************************
U00096 CGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCG
U00296 CGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCG
U00196 CGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCG
************************************************************
U00096 CAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAA------------------ATAAAA
U00296 CAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAGATCAGGAATTTGCCCAAATAAAA
U00196 CAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAA------------------ATAAAA
************************************ ******
U00096 CATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTG
U00296 CATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTG
U00196 CATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTG
************************************************************
U00096 ATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGT
U00296 ATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGT
U00196 ATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGT
************************************************************
U00096 CACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAA
U00296 CACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAA
U00196 CACAACGTTACTGTTATCGATCCAGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAA
*********************** ************************************
U00096 TCTACCGTCGATATTGCT
U00296 TCTACCGTCGATATTGCT
U00196 TCTACCGTCGATATTGCT
******************
Output file example
Mutation site(s) at index: 66, 482, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653 ****************************************************************** ******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* ********************************************************************************************************************************************************* ************************************************************************************************************************************************************************************************************
DNA characterization version 1.1
The script:
(##1) reads in a file containing a DNA sequence written in capital letters
(##2) determines the length of the sequence
(##3) writes the sequence to the screen
(##4) creates the reverse compliment and writes it to the screen
(##5) calculates the GC content of the sequence
(##6) finds all EcoRI restriction enzyme cut sites and writes them to the screen
#!/usr/bin/perl -w
use strict;
my $filename = 'sarsg.txt';
my $sarsl;
my $sarsg;
my $rcsarsg;
my @sarsga;
my $gcbases;
my $gc;
##1
# Open SARS genome file and read in the genome
open(FILE, $filename); #Open file containing SARS genome
while(<FILE>) {
$sarsg = $sarsg . $_;
}
print "$sarsg";
##2
# calculate and print length of SARS genome
$sarsl =length $sarsg;
##3
print "DNA from $filename :$sarsl\n";
##4
# Make the reverse compliment of the sequence
$rcsarsg = reverse $sarsg;
$rcsarsg =~ tr/ATGCatgc/TACGtacg/;
print "\n Reverse compliment of $filename: $rcsarsg\n";
##5
# GC content
@sarsga = split('', $sarsg);
$gcbases = 0;
my $base;
foreach $base (@sarsga) {
if ($base eq 'G'){
++$gcbases;
}
elsif ($base eq 'C'){
++$gcbases;
}
}
$gc = $gcbases / $sarsl;
print"\nGC content in $filename: \n$gc\n";
##6
# Find EcoRI restriction sites
my $eco;
my $pos = 0;
my $i=-0.01;
while ($i>=-0.1) {
$i = index($sarsg, "GAATTC", $pos);
if ($i >= 0) {
$pos = $i + 1;
$eco .= " $i,";
}
}
if ($eco) {
chop $eco;
print "EcoRI site(s) at index:$eco\n";
} else {
print "No EcoRI sites found.\n";
}
DNA characterization version 1.0
#!/usr/bin/perl -w
use strict;
my $filename = 'sarsg.txt';
my $sarsl;
my $sarsg;
my $rcsarsg;
my @sarsga;
my $gcbases;
my $gc;
# Open SARS genome file and read in the genome
open(FILE, $filename); #Open file containing SARS genome
while(<FILE>) {
$sarsg = $sarsg . $_;
}
print "$sarsg";
# calculate and print length of SARS genome
$sarsl =length $sarsg;
print "DNA from $filename :$sarsl\n";
# Make the reverse compliment of the sequence
$rcsarsg = reverse $sarsg;
$rcsarsg =~ tr/ATGCatgc/TACGtacg/;
print "\n Reverse compliment of $filename: $rcsarsg\n";
# GC content
@sarsga = split('', $sarsg);
$gcbases = 0;
my $base;
foreach $base (@sarsga) {
if ($base eq 'G'){
++$gcbases;
}
elsif ($base eq 'C'){
++$gcbases;
}
}
$gc = $gcbases / $sarsl;
print"\nGC content in $filename: \n$gc\n";
# Find EcoRI restriction sites
if ($sarsg =~ /GAATTC/) {
print "EcoRI restriction sites in $filename\n";
}

