#!/usr/bin/perl -w
use strict;
use warnings;

# Calculate the percentage composition of amino acids for every protein
# sequence in a multi-FASTA file and save the result in .arff format for
# input into the Weka software.
#
# Usage: <script> <multi-fasta file> <output file (.arff)> <class (1 or 0)>
#
# One ARFF data row is written per FASTA record: 21 comma-separated
# percentages (one per residue code below, in that order) followed by the
# class value supplied on the command line.

if (@ARGV != 3) {
    die "Supply a multi fasta file, output file (.arff) and class attribute (i.e.1 or 0)\n";
}
my ($in_name, $out_name, $class) = @ARGV;

# Residue codes, in the column order used for the ARFF attributes.
my @aa_code = qw(A C D E F G H I K L M N P Q R S T V W X Y);

# Map residue letter -> column index so each residue is counted with a
# single hash lookup instead of a scan over @aa_code.
my %index_of;
@index_of{@aa_code} = (0 .. $#aa_code);

# Opening files (3-arg open, lexical handles, failures are fatal).
open my $in_fh, '<', $in_name
    or die "Cannot open input file '$in_name': $!\n";
open my $out_fh, '>', $out_name
    or die "Cannot open output file '$out_name': $!\n";

# ARFF header.
printf {$out_fh} "\@RELATION aa_comp\n";
for my $code (@aa_code) {
    printf {$out_fh} "\@ATTRIBUTE aa_%s NUMERIC\n", $code;
}
printf {$out_fh} "\@ATTRIBUTE class {1,0}\n";
printf {$out_fh} "\@data\n";

{
    # Read one FASTA record per iteration: records are separated by a
    # newline followed by '>'. Localised so the change cannot leak.
    local $/ = "\n>";

    RECORD:
    while (my $sequence = <$in_fh>) {
        chomp $sequence;                            # drop trailing "\n>"

        # Remove the FASTA header line. Also handles a header that is the
        # very last thing in the file (no newline after it).
        $sequence =~ s/\A>?[^\n]*(?:\n|\z)//;

        # Remove line ends and any other whitespace (e.g. "\r" from CRLF
        # files) so they do not inflate the sequence length.
        $sequence =~ s/\s+//g;

        # FASTA permits lower-case residue letters; count them as well.
        $sequence = uc $sequence;

        my $len = length $sequence;
        if ($len == 0) {
            # Nothing to compute, and dividing by zero would be fatal.
            warn "Skipping FASTA record with an empty sequence\n";
            next RECORD;
        }

        # Count up the number of each type of aa in the sequence. Symbols
        # that are not in @aa_code still contribute to $len but to no
        # column, so the percentages of such a row sum to less than 100.
        my @cnt = (0) x @aa_code;
        for my $residue (split //, $sequence) {
            my $i = $index_of{$residue};
            $cnt[$i]++ if defined $i;
        }

        # Calculate aa compositions and print them out, then the class.
        for my $count (@cnt) {
            printf {$out_fh} "%4.2f,", ($count / $len) * 100;
        }
        printf {$out_fh} "%d \n", $class;
    }
}

close $in_fh;
# Buffered write errors only surface at close, so check it.
close $out_fh
    or die "Error while closing output file '$out_name': $!\n";