Perlscr

From FreeBio

(Difference between revisions)
Revision as of 14:09, 29 November 2005
Msommer (Talk | contribs)
Determining positions of deviations from reference genome from an clustalw .aln file
? Previous diff
Current revision
Msommer (Talk | contribs)
Making variability map from copy genome difference file format
Line 60: Line 60:
-<\pre>+</pre>
==Determining positions of deviations from reference genome from an clustalw .aln file == ==Determining positions of deviations from reference genome from an clustalw .aln file ==

Current revision

Contents

Making variability map from copy genome difference file format

The perl script takes as input a file containing a number of genome sequences in difference format. It reads in the positions of mutations and orders them and writes them to a file

The perl script: varmap.pl

#!/usr/bin/perl -w

use Bio::Perl;

my $datafile = 'allcopyseq.rdat'; # file containing the position of mutations and there 

type

my $varmapfile ='allcopyseq.rdat';

# read in strings 
my $count = 1;
my @muts;
open(FILE, $datafile);
while (<FILE>) {
        push @muts, [$_]; 	# thus each sequence of mutations is listed in the [n][0] 

 					# @muts element of
	$count++; 
    }
close(FILE);

## Split up positions of mutations into an array
my @seqpos;
my $k = 1;
while ($k<$count) {
	my @seqmut;
	@seqmut = split(/ /,$muts[$k][0]);
	my $i=0;
	$i = $#seqmut; 

	## Make an array that only contains the positions of the mutations

	my @mutpos;

	@mutpos=@seqmut;
	$j = 0;
	while ($j<$i){
		my $sp;
		$sp =$i - $j;
		splice(@mutpos, $sp, 1);
		$j=$j+2;
		}
	@seqpos = (@seqpos, @mutpos);
	$k=$k+2;
	}
my @varmap;
@varmap = sort { $a <=> $b } @seqpos;
print "@varmap\n";

open(DAT,">$varmapfile") || die("Cannot Open File");
print DAT "@varmap\n";
close(DAT);



Determining positions of deviations from reference genome from an clustalw .aln file

The script reads in a clustalw alignment file and creates a string that contains the results of the alignment (That is where the sequences agree and disagree). The positions of disagreement are found and are written to a file including the string of alignment results.

The perl script: readalign.pl

#!/usr/bin/perl -w


use Bio::Perl;

my $filename = 'xxx.aln'; # clustalw alignment input file
my $nseq = 2; #Number of sequences that have been aligned
my $align; # variable containing the alignment code only
my $rawalign = 'xxx.raln', # Output filename for raw alignment code
my $mut; # variable containing sites of mutations
##1 READ IN RAW ALIGNMENT SEQUENCE

# Open clustal alignment file 
open(FILE, $filename); #Open file containing SARS genome
# Go through alignment file headers
<FILE>; 
<FILE>;
<FILE>;
while(<FILE>) {
	for ($count=1; $count<$nseq +1; $count++){
		$_=<FILE>; # Do not read in the sequences from the alignment file
		print "$_";

	}
	$_ =substr($_, 16); # remove blanks before the alignment code
	chomp $_; # remove newline at the end of alignment code 
	$align = $align . $_; # Append to alignment string containing raw alingment code 
	close(FILE);
my $i = -0.01;
my $pos = 0;
while ($i>=-0.1) {
  $i = index($align, " ", $pos);
  if ($i >= 0) {
    $pos = $i + 1;
    $mut .= " $i,";
  } 
}

# Write raw alignment data and positions of mutations to file as specified by the variable $rawalign
open(DAT,">$rawalign") || die("Cannot Open File");

if ($mut) {
  chop $mut;
  print DAT "Mutation site(s) at index:$mut\n";
} else {
  print DAT "No mutations found.\n";
}

print DAT "$align\n"; 
close(DAT);

Input file example

CLUSTAL W (1.83) multiple sequence alignment


U00096          AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
U00296          AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
U00196          AGCTTTTCATTCTGACTGCAACGGGCAATATGTCTCTGTGTGGATTAAAAAAAGAGTGTC
                ************************************************************

U00096          TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
U00296          TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
U00196          TGATAGCAGCTTCTGAACTGGTTACCTGCCGTGAGTAAATTAAAATTTTATTGACTTAGG
                ************************************************************

U00096          TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
U00296          TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
U00196          TCACTAAATACTTTAACCAATATAGGCATAGCGCACAGACAGATAAAAATTACAGAGTAC
                ************************************************************

U00096          ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
U00296          ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
U00196          ACAACATCCATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGT
                ************************************************************

U00096          AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
U00296          AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
U00196          AACGGTGCGGGCTGACGCGTACAGGAAACACAGAAAAAAGCCCGCACCTGACAGTGCGGG
                ************************************************************

U00096          CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
U00296          CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
U00196          CTTTTTTTTTCGACCAAAGGTAACGAGGTAACAACCATGCGAGTGTTGAAGTTCGGCGGT
                ************************************************************

U00096          ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
U00296          ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
U00196          ACATCAGTGGCAAATGCAGAACGTTTTCTGCGTGTTGCCGATATTCTGGAAAGCAATGCC
                ************************************************************

U00096          AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
U00296          AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
U00196          AGGCAGGGGCAGGTGGCCACCGTCCTCTCTGCCCCCGCCAAAATCACCAACCACCTGGTG
                ************************************************************

U00096          GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
U00296          GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
U00196          GCGATGATTGAAAAAACCATTAGCGGCCAGGATGCTTTACCCAATATCAGCGATGCCGAA
                ************************************************************

U00096          CGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCG
U00296          CGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCG
U00196          CGTATTTTTGCCGAACTTTTGACGGGACTCGCCGCCGCCCAGCCGGGGTTCCCGCTGGCG
                ************************************************************

U00096          CAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAA------------------ATAAAA
U00296          CAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAAGATCAGGAATTTGCCCAAATAAAA
U00196          CAATTGAAAACTTTCGTCGATCAGGAATTTGCCCAA------------------ATAAAA
                ************************************                  ******

U00096          CATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTG
U00296          CATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTG
U00196          CATGTCCTGCATGGCATTAGTTTGTTGGGGCAGTGCCCGGATAGCATCAACGCTGCGCTG
                ************************************************************

U00096          ATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGT
U00296          ATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGT
U00196          ATTTGCCGTGGCGAGAAAATGTCGATCGCCATTATGGCCGGCGTATTAGAAGCGCGCGGT
                ************************************************************

U00096          CACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAA
U00296          CACAACGTTACTGTTATCGATCCGGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAA
U00196          CACAACGTTACTGTTATCGATCCAGTCGAAAAACTGCTGGCAGTGGGGCATTACCTCGAA
                *********************** ************************************

U00096          TCTACCGTCGATATTGCT
U00296          TCTACCGTCGATATTGCT
U00196          TCTACCGTCGATATTGCT
                ******************

Output file example

Mutation site(s) at index: 66, 482, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653
****************************************************************** ******************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************* *********************************************************************************************************************************************************                  ************************************************************************************************************************************************************************************************************

DNA characterization version 1.1

The script:

(##1) reads in a file containing a DNA sequence written in capital letters

(##2) determines the length of the sequence

(##3) writes the sequence to the screen

(##4) creates the reverse compliment and writes it to the screen

(##5) calculates the GC content of the sequence

(##6) finds all EcoRI restriction enzyme cut sites and writes them to the screen

#!/usr/bin/perl -w

use strict;

my $filename = 'sarsg.txt';
my $sarsl;
my $sarsg;
my $rcsarsg;
my @sarsga;
my $gcbases;
my $gc;

##1
# Open SARS genome file and read in the genome
open(FILE, $filename); #Open file containing SARS genome

while(<FILE>) {

	$sarsg = $sarsg . $_;
 }

print "$sarsg";

##2
# calculate and print length of SARS genome 

$sarsl =length $sarsg;

##3
print "DNA from $filename :$sarsl\n";

##4
# Make the reverse compliment of the sequence

$rcsarsg = reverse $sarsg;

$rcsarsg =~ tr/ATGCatgc/TACGtacg/;
 
print "\n Reverse compliment of $filename: $rcsarsg\n";

##5
# GC content

@sarsga = split('', $sarsg);

$gcbases = 0;

my $base;

foreach $base (@sarsga) {
	if ($base eq 'G'){
		++$gcbases;
	}
	elsif ($base eq 'C'){
		++$gcbases;
	}
}
$gc = $gcbases / $sarsl;
print"\nGC content in $filename: \n$gc\n";

##6 
# Find EcoRI restriction sites

my $eco;
my $pos = 0;
my $i=-0.01;
while ($i>=-0.1) {
  $i = index($sarsg, "GAATTC", $pos);
  if ($i >= 0) {
    $pos = $i + 1;
    $eco .= " $i,";
  } 
}

if ($eco) {
  chop $eco;
  print "EcoRI site(s) at index:$eco\n";
} else {
  print "No EcoRI sites found.\n";
}

DNA characterization version 1.0

#!/usr/bin/perl -w

use strict;
my $filename = 'sarsg.txt';
my $sarsl;
my $sarsg;
my $rcsarsg; 
my @sarsga;
my $gcbases;
my $gc;

# Open SARS genome file and read in the genome
open(FILE, $filename); #Open file containing SARS genome

while(<FILE>) {

	$sarsg = $sarsg . $_;
 }

print "$sarsg";


# calculate and print length of SARS genome 

$sarsl =length $sarsg;

print "DNA from $filename :$sarsl\n";


# Make the reverse compliment of the sequence

$rcsarsg = reverse $sarsg;

$rcsarsg =~ tr/ATGCatgc/TACGtacg/;
 
print "\n Reverse compliment of $filename: $rcsarsg\n";


# GC content

@sarsga = split('', $sarsg);

$gcbases = 0;

my $base;

foreach $base (@sarsga) {
	if ($base eq 'G'){
		++$gcbases;
	}
	elsif ($base eq 'C'){
		++$gcbases;
	}
}
$gc = $gcbases / $sarsl;
print"\nGC content in $filename: \n$gc\n";


# Find EcoRI restriction sites

if ($sarsg =~ /GAATTC/) {
	print "EcoRI restriction sites in $filename\n";
}