EukPhylo/PTL2/Scripts-GRID/guidance.v2.02/www/bioSequence_scripts_and_constants/parseFiles.pm

#!/usr/bin/perl -w

package parseFiles;

use Bio::SearchIO;
use Bio::SeqIO;
use Bio::AlignIO;
use Bio::Align::AlignI;

use strict;
#------------------------------------------------------
sub add_sequences_removed_by_cd_hit_to_rejected_report
#------------------------------------------------------
{
	my $cd_hit_clusters_file=shift;
	my $rejected_fragments_file=shift;

	unless (open (REJECTED,">>$rejected_fragments_file")) {return "extract_sequences_removed_by_cd_hit: Can't open '$rejected_fragments_file' for writing: $!";}
	unless (open (CDHIT,$cd_hit_clusters_file)) {return "extract_sequences_removed_by_cd_hit: Can't open '$cd_hit_clusters_file' for reading: $!\n";}
	my %cluster_members=();
	my $cluster_head="";
	while (my $line=<CDHIT>)
	{

		if ($line=~/^>Cluster/) # New Cluster
		{
			foreach my $cluster_member (keys %cluster_members)
			{
				print REJECTED "Fragment $cluster_member rejected: the sequence shares $cluster_members{$cluster_member} identity with $cluster_head ($cluster_head was preserved)\n";
			}
			%cluster_members=();
			$cluster_head="";
		}
		else # Clusters Members
		{
			my @line=split(/\s+/,$line);
			if ($line[3] eq "*")
			{
				$line[2] =~ s/[>]//;
				$cluster_head=$line[2];
			}
			else
			{
				$line[2] =~ s/[>]//;
				$cluster_members{$line[2]}=$line[4];
			}
		}
	}
	return "ok";
}
#----------------------------------
sub create_cd_hit_output{
#----------------------------------
    my $working_dir = shift;
    my $input_file = shift;   # fasta file with sequences to cluster
    my $output_file = shift;  # final fasta file with chosen homolougs
    my $cutoff = shift;       # the decimal fraction of identity we allow between the homolougs
    my $cd_hit_dir = shift;   # directory from where we run cd-hit program
    my $ref_cd_hit_hash = shift; # hash that holds all the chosen homolougs sequences
    my $which_server = shift;

    my ($seq, $seq_name);

    ##################
    # running cd-hit #
    ##################
#    my $cmd="./cd-hit -i $working_dir"."$input_file -o $working_dir"."$output_file -c $cutoff";
     my $cmd=$cd_hit_dir."cd-hit -i $working_dir"."$input_file -o $working_dir"."$output_file -c $cutoff";
    if (defined($which_server) and $which_server eq "from_ibis"){
	#$cmd = "ssh bioseq\@atlas.tau.ac.il '$cmd'";
         $cmd = "ssh bioseq\@lecs '$cmd'";
#        $cmd = "ssh bioseq\@biocluster 'cd $cd_hit_dir; $cmd'";
    }
    else{
        chdir($cd_hit_dir);
    }
    print "$cmd\n";
    my $ans = `$cmd`;

    unless ((-e $working_dir.$output_file)||(-z $working_dir.$output_file)){
        return ("sys", "parseFiles::create_cd_hit_output : $cmd: CD-HIT produced no output!\n");
    }
    #####################################
    # insert chosen homolougs to a hash #
    #####################################
    my $cd_hit_result  = Bio::SeqIO->new('-file' => "$working_dir"."$output_file" , '-format' => 'Fasta');
    while ( my $seqObj = $cd_hit_result->next_seq() ) {
        $seq = $seqObj->seq();
        $seq_name = $seqObj->primary_id();
	my $description = $seqObj->desc();
   #     $ref_cd_hit_hash->{$seq_name} = $seq;
   	$ref_cd_hit_hash->{$seq_name}{SEQ} = $seq;
	$ref_cd_hit_hash->{$seq_name}{DESCRIPTION} = $description;
    }
    $cd_hit_result->close();

    return "ok";

}

#----------------------------------
sub choose_homologoues_from_blast{
#----------------------------------
    my $working_dir = shift;
    my $query = shift;  # query sequence, given in Fasta Format

    my $redundancyRate = shift;  #a number between 0-100 that will decide under which % we take homolougs
    my $frag_overlap = shift;   # the maximum % of overlapping between 2 fragments of the same hit (should be given as decimal fraction
    my $min_length_percent = shift;  # the minimum percent a homoloug should have ( given as decimal fraction)
    my $min_num_of_homolougs = shift; # the minimum number of homolougs we must collect

    my $blast_output = shift;  # output of blast run, to read from, full path
    my $fasta_output = shift;  # the final homolouges file, to write to
    my $rejected_seqs = shift; # file that will have the rejected sequences and the reason for it

    my $ref_blast_hash = shift; # a hash to hold all the sequences names and their e-values

    my ($query_seq_name, $query_seq_length, $query_AAseq, $min_length, $seq_details, $s_name, $AAseq, $s_eval, $s_beg, $s_end, $s_ident,$s_description,$seq_description);
    my $seq_name_exists = "no";
    my $ans = "no";

    my %sequences = ();
    # hash of sequences names. each sequence name (unique) points to array of hashes.
    # {seq_name} => [{e_val => <e_val1>, AAseq => <AAseq1>, beg => <S1_beg>, end => <S1_end>},...,{e_val => <e_valn>, AAseq => <AAseqn>, beg => <Sn_beg>, end => <Sn_end>}]
    # seq_name : the name,
    # S1_beg : the index for the begining of the match for S1,
    # S1_end : the index to the end of the match for S1
    # e_val1 : the e-value.
    # AAseq1 : the sequence itself, as read from the "Subjct" line of the HSP

    unless (open QUERY, $query){
        return ("sys", "parseFiles::choose_homologoues_from_blast : can't open file $query for reading\n");
    }
    ################################
    # Extracting Query information #
    ################################

    while (<QUERY>){
        if ($_ !~ m/>/){
            $_=~m/(\S+)/;
            $query_seq_length += length($1);
        }
    }
    close QUERY;

    ######################################################
    # defining the minimum length a homoloug should have #
    ######################################################
    $min_length = $query_seq_length*$min_length_percent; #60% the query's length

    ##################################################
    # Reading blast output and collect the homolougs #
    ##################################################

    unless (open OUT_REJECT, ">".$working_dir.$rejected_seqs){
        {return ("sys", "parseFiles::choose_homologoues_from_blast : can't open file $rejected_seqs for writing\n");}
    }
    my $searchio = new Bio::SearchIO(-format => 'blast',
                                     -file   => $blast_output);
    while( my $result = $searchio->next_result ) {  # $result is a Bio::Search::Result::ResultI object
        while( my $hit = $result->next_hit ) { # $hit is a Bio::Search::Hit::HitI object or undef if there are no more
            $s_description=$hit->description();
	    $s_name= $hit->name();
            if ($s_name =~ m/.+\|(\S+\|\S+_\S+)/) {$s_name = $1;}
            if ($s_name =~ m/.+\|(\S+_\S+)\|(\S+)/) {$s_name = $2.'|'.$1;}
            elsif ($s_name =~ m/(\S+\|\S+_\S+)/) {$s_name = $1;}
            elsif ($s_name =~ m/(\S+_\S+)\|(\S+)/) {$s_name = $2.'|'.$1;}
            $seq_name_exists = "no";
            while( my $hsp = $hit->next_hsp ) { #hsp is the next available High Scoring Pair, Bio::Search::HSP::HSPI object or null if finished
                # extracting relevant details from the fragment
                ($s_beg, $s_end) = $hsp->range("sbjct");
                $AAseq = $hsp->hit_string();
                $AAseq =~ s/-//g;
                $s_eval = $hsp->evalue();
                $s_eval =~ s/,//g;
                if ($s_eval =~ m/^e/) {$s_eval = "1".$s_eval;}
                $s_ident = $hsp->percent_identity();
                # deciding if we take the fragment
                # in case there is already a fragemnt with the same name, we do another validity test for overlapping
                if (exists $sequences{$s_name}){
                    $seq_name_exists = "yes";
                    $seq_details = $sequences{$s_name};
                    $ans = &check_if_seq_valid($redundancyRate, $min_length, $s_ident, $AAseq, $s_name);
                    if ($ans eq "yes"){
                        {$ans = &check_if_no_overlap($frag_overlap, $seq_details, $s_beg, $s_end);}
                    }
                }
                else {
                    $ans = &check_if_seq_valid($redundancyRate, $min_length, $s_ident, $AAseq, $s_name);
                }
                # after taking the info, check if the currecnt sequence is valid. If so - insert it to the hash
                if ($ans eq "yes"){
                    # in case there is more than one fragment for this seq_name: add another hash to the details array
                    if ($seq_name_exists eq "yes"){
#                        push @$seq_details, {e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end}; #HAIM
			push @$seq_details, {e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end, description => $s_description};
                    }
                    # in case it is the first fragment for this seq_name: insert a details array as a value for this seq_name key
                    else{
#                        $sequences{$s_name} = [{e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end}]; #HAIM
			$sequences{$s_name} = [{e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end, description => $s_description}];
                    }
                }
                else {print OUT_REJECT "Fragment $s_name"."_$s_beg"."_$s_end rejected: $ans\n";}
            }
        }
    }
    $searchio->close();
    close OUT_REJECT;

    ##############################################################################
    # Print the selected homolougs to a file and insert the e-value info to hash #
    ##############################################################################

    my $i;
    my $seq_frag_name;

    unless (open OUT ,">".$working_dir.$fasta_output)
        {return ("sys", "parseFiles::choose_homologoues_from_blast : can't open file $fasta_output for writing\n");}
    while (($s_name, $seq_details) = each (%sequences)){
        for ($i=0; $i<=$#$seq_details; $i++){
            $seq_frag_name = "$s_name"."_".$seq_details->[$i]{beg}."_".$seq_details->[$i]{end};
	    $seq_description= $seq_details->[$i]{description};
            #print OUT ">$seq_frag_name\n"; #HAIM
            print OUT ">$seq_frag_name | $seq_description\n";
	    print OUT $seq_details->[$i]{AAseq}."\n";
            $ref_blast_hash->{$seq_frag_name} = $seq_details->[$i]{e_val};
        }
    }
    close OUT;

    #####################################################
    # Check that the number of homolougs found is legal #
    #####################################################

    my $final_num_homolougs = (keys %$ref_blast_hash);
    my $ret = "";
    if ($final_num_homolougs == 1){
        $ret .= "only <a href=$fasta_output>one unique sequence</a> ";
    }
    elsif ($final_num_homolougs <= $min_num_of_homolougs){
        $ret .= "only <a href=$fasta_output>$final_num_homolougs unique sequences</a> ";
    }
    else{
        $ret = "ok";
    }
    if ($ret !~ /^ok/){
        my $blast_link;
        $blast_link = $1 if ($blast_output =~ /$working_dir(.+)/);
        $ret .= "were chosen from <a href=$blast_link>PSI-BLAST output</a>. (<a href =$rejected_seqs>Click here</a> if you wish to view the list of sequences which produced significant alignments in blast, but were not chosen as hits.).<br />The minimal number of sequences required for the calculation is $min_num_of_homolougs.<br>";
        return ("user", $ret);
    }
    return $ret;
}


#----------------------------------------------------------
sub choose_homologoues_from_blast_with_lower_identity_cutoff{
#----------------------------------------------------------
# Do the same a choose_homologoues_from_blast() but also apply lower boundary of %ID
    my $working_dir = shift;
    my $query = shift;  # query sequence, given in Fasta Format

    my $redundancyRate = shift;  #a number between 0-100 that will decide under which % we take homolougs
    my $frag_overlap = shift;   # the maximum % of overlapping between 2 fragments of the same hit (should be given as decimal fraction
    my $min_length_percent = shift;  # the minimum percent a homoloug should have ( given as decimal fraction)
    my $min_id_percent =shift; #the minimal persent of identity to consider as homolog (given as a number - eg 35 for 35%)
    my $min_num_of_homolougs = shift; # the minimum number of homolougs we must collect

    my $blast_output = shift;  # output of blast run, to read from, full path
    my $fasta_output = shift;  # the final homolouges file, to write to
    my $rejected_seqs = shift; # file that will have the rejected sequences and the reason for it

    my $ref_blast_hash = shift; # a hash to hold all the sequences names and their e-values

    my $ref_blast_hash_seqs_and_details = shift; # optional hash ref to hold the blast seqs and details.

	my $Nuc_Seq=shift; # does the expected seq is nucleotides
	if ($Nuc_Seq eq ""){$Nuc_Seq ="no";} # deafult is AA seq
    my ($query_seq_name, $query_seq_length, $query_AAseq, $min_length, $seq_details, $s_name, $AAseq, $s_eval, $s_beg, $s_end, $s_ident,$s_description,$seq_description);
    my $seq_name_exists = "no";
    my $ans = "no";

    my %sequences = ();
    # hash of sequences names. each sequence name (unique) points to array of hashes.
    # {seq_name} => [{e_val => <e_val1>, AAseq => <AAseq1>, beg => <S1_beg>, end => <S1_end>},...,{e_val => <e_valn>, AAseq => <AAseqn>, beg => <Sn_beg>, end => <Sn_end>}]
    # seq_name : the name,
    # S1_beg : the index for the begining of the match for S1,
    # S1_end : the index to the end of the match for S1
    # e_val1 : the e-value.
    # AAseq1 : the sequence itself, as read from the "Subjct" line of the HSP

    unless (open QUERY, $query){
        return ("sys", "parseFiles::choose_homologoues_from_blast : can't open file $query for reading\n");
    }
    ################################
    # Extracting Query information #
    ################################

    while (<QUERY>){
        if ($_ !~ m/>/){
            $_=~m/(\S+)/;
            $query_seq_length += length($1);
        }
    }
    close QUERY;

    ######################################################
    # defining the minimum length a homoloug should have #
    ######################################################
    $min_length = $query_seq_length*$min_length_percent; #60% the query's length

    ##################################################
    # Reading blast output and collect the homolougs #
    ##################################################

    unless (open OUT_REJECT, ">".$working_dir.$rejected_seqs){
        {return ("sys", "parseFiles::choose_homologoues_from_blast : can't open file $rejected_seqs for writing\n");}
    }
    my $searchio = new Bio::SearchIO(-format => 'blast',
                                     -file   => $blast_output);
    while( my $result = $searchio->next_result ) {  # $result is a Bio::Search::Result::ResultI object
        while( my $hit = $result->next_hit ) { # $hit is a Bio::Search::Hit::HitI object or undef if there are no more
            $s_description=$hit->description();
	    $s_name= $hit->name();
            if ($s_name =~ m/.+\|(\S+\|\S+_\S+)/) {$s_name = $1;}
            if ($s_name =~ m/.+\|(\S+_\S+)\|(\S+)/) {$s_name = $2.'|'.$1;}
            elsif ($s_name =~ m/(\S+\|\S+_\S+)/) {$s_name = $1;}
            elsif ($s_name =~ m/(\S+_\S+)\|(\S+)/) {$s_name = $2.'|'.$1;}
			elsif ($s_name =~ m/(\S+_\S+)\|(\S+)/) {$s_name = $2.'|'.$1;}
			elsif ($s_name =~ m/(.*)\|([A-Za-z0-9._]+)\|/) {$s_name = $2;}
            $seq_name_exists = "no";
            while( my $hsp = $hit->next_hsp ) { #hsp is the next available High Scoring Pair, Bio::Search::HSP::HSPI object or null if finished
                # extracting relevant details from the fragment
                ($s_beg, $s_end) = $hsp->range("sbjct");
                $AAseq = $hsp->hit_string();
                $AAseq =~ s/-//g;
                $s_eval = $hsp->evalue();
                $s_eval =~ s/,//g;
                if ($s_eval =~ m/^e/) {$s_eval = "1".$s_eval;}
                $s_ident = $hsp->percent_identity();
                # deciding if we take the fragment
                # in case there is already a fragemnt with the same name, we do another validity test for overlapping
                if (exists $sequences{$s_name}){
                    $seq_name_exists = "yes";
                    $seq_details = $sequences{$s_name};
                    $ans = &check_if_seq_valid_with_min_id($redundancyRate, $min_length, $min_id_percent, $s_ident, $AAseq, $s_name, $Nuc_Seq);
                    if ($ans eq "yes"){
                        {$ans = &check_if_no_overlap($frag_overlap, $seq_details, $s_beg, $s_end);}
                    }
                }
                else {
                    $ans = &check_if_seq_valid_with_min_id($redundancyRate, $min_length, $min_id_percent, $s_ident, $AAseq, $s_name,$Nuc_Seq);
                }
                # after taking the info, check if the currecnt sequence is valid. If so - insert it to the hash
                if ($ans eq "yes"){
                    # in case there is more than one fragment for this seq_name: add another hash to the details array
                    if ($seq_name_exists eq "yes"){
#                        push @$seq_details, {e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end}; #HAIM
			push @$seq_details, {e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end, description => $s_description};
                    }
                    # in case it is the first fragment for this seq_name: insert a details array as a value for this seq_name key
                    else{
#                        $sequences{$s_name} = [{e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end}]; #HAIM
			$sequences{$s_name} = [{e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end, description => $s_description}];
                    }
                }
                else {print OUT_REJECT "Fragment $s_name"."_$s_beg"."_$s_end rejected: $ans\n";}
            }
        }
    }
    $searchio->close();
    close OUT_REJECT;

    ##############################################################################
    # Print the selected homolougs to a file and insert the e-value info to hash #
    ##############################################################################

    my $i;
    my $seq_frag_name;

    unless (open OUT ,">".$working_dir.$fasta_output)
        {return ("sys", "parseFiles::choose_homologoues_from_blast : can't open file $fasta_output for writing\n");}
    while (($s_name, $seq_details) = each (%sequences)){
        for ($i=0; $i<=$#$seq_details; $i++){
            if ($s_name !~ /(\|)$/)
	    {
	    	$seq_frag_name = "$s_name"."_".$seq_details->[$i]{beg}."_".$seq_details->[$i]{end};
	    }
	    else
	    {
	    	$seq_frag_name = "$s_name"."FRAMENT_".$seq_details->[$i]{beg}."_".$seq_details->[$i]{end};
	    }
	    $seq_description= $seq_details->[$i]{description};
            #print OUT ">$seq_frag_name\n"; #HAIM
            print OUT ">$seq_frag_name | $seq_description\n";
	    print OUT $seq_details->[$i]{AAseq}."\n";
            $ref_blast_hash->{$seq_frag_name} = $seq_details->[$i]{e_val};
	    if (defined $ref_blast_hash_seqs_and_details)
	    {
	    	$ref_blast_hash_seqs_and_details->{$seq_frag_name}->{'start'}=$seq_details->[$i]{beg};
		$ref_blast_hash_seqs_and_details->{$seq_frag_name}->{'end'}=$seq_details->[$i]{end};
	    	$ref_blast_hash_seqs_and_details->{$seq_frag_name}->{'e_val'}=$seq_details->[$i]{e_val};
	    	$ref_blast_hash_seqs_and_details->{$seq_frag_name}->{'description'}=$seq_details->[$i]{description};
		$ref_blast_hash_seqs_and_details->{$seq_frag_name}->{'seq'}=$seq_details->[$i]{AAseq};
	    }
        }
    }
    close OUT;

    #####################################################
    # Check that the number of homolougs found is legal #
    #####################################################

    my $final_num_homolougs = (keys %$ref_blast_hash);
    my $ret = "";
    if ($final_num_homolougs == 1){
        $ret .= "only <a href=$fasta_output>one unique sequence</a> ";
    }
    elsif ($final_num_homolougs <= $min_num_of_homolougs){
        $ret .= "only <a href=$fasta_output>$final_num_homolougs unique sequences</a> ";
    }
    else{
        $ret = "ok";
    }
    if ($ret !~ /^ok/){
        my $blast_link;
        $blast_link = $1 if ($blast_output =~ /$working_dir(.+)/);
        $ret .= "were chosen from <a href=$blast_link>PSI-BLAST output</a>.";
	if (!-z  $working_dir.$rejected_seqs)
	{
	 	$ret.="(<a href =$rejected_seqs>Click here</a> if you wish to view the list of sequences which produced significant alignments in blast, but were not chosen as hits.).";
	}
	$ret.="<br />The minimal number of sequences required for the calculation is $min_num_of_homolougs.<br>";
        return ("user", $ret);
    }
    return $ret;
}
#----------------------------------
sub check_if_seq_valid{
# checks if a sequence read from blast is valid according to some parameters.
# Found valid - returns "yes"
#----------------------------------
#   GLOBAL INPUT
    my $redundancyRate = shift; # The maximum % of identity
    my $min_length = shift;     # The minimum length a sequence should have

#   HOMOLOUG SPECIFIC INPUT
    my $ident_percent = shift;  # percentage identity
    my $aaSeq = shift;          # the Amino Acid sequence
    my $seqName = shift;        # the sequence identifier

    my $seq_length = length($aaSeq);
    my $ans;

    #parameter1: the sequence identity is less than the idnentity percent that was defined
    if ($ident_percent >= $redundancyRate){ $ans = "identity percent $ident_percent is too big";}

    #parameter2: the sequnece length is greater than the minimum sequence length
    elsif ($seq_length<$min_length){$ans = "the sequence length $seq_length is too short" ;}

    #parameter2: the sequnece letters should be legal to rate4site
    elsif ($aaSeq!~ m/^[ACDEFGHIKLMNPQRSTVWYBZX]+$/){$ans = "illegal character was found in sequence: $seqName";}

    else {$ans="yes"};

    return $ans;
}
#----------------------------------
sub check_if_seq_AA_valid{
# checks if a sequence read from blast is valid according to some parameters.
# Found valid - returns "yes"
#----------------------------------
#   GLOBAL INPUT
    my $aaSeq = shift;          # the Amino Acid sequence
    my $seqName = shift;        # the sequence identifier
    my $ans;

    #parameter2: the sequnece letters should be legal to rate4site
    if ($aaSeq!~ m/^[ACDEFGHIKLMNPQRSTVWYBZX]+$/){$ans = "illegal character was found in sequence: $seqName";}

    else {$ans="yes"};

    return $ans;
}
#----------------------------------
sub check_if_seq_valid_with_min_id{
# checks if a sequence read from blast is valid according to some parameters.
# Found valid - returns "yes"
#----------------------------------
#   GLOBAL INPUT
    my $redundancyRate = shift; # The maximum % of identity
    my $min_length = shift;     # The minimum length a sequence should have
    my $min_id =shift; 		# The minimal % of identity

#   HOMOLOUG SPECIFIC INPUT
    my $ident_percent = shift;  # percentage identity
    my $aaSeq = shift;          # the Amino Acid sequence
    my $seqName = shift;        # the sequence identifier
    my $Nuc_Seq =shift; #OPTIONAL is the sequence expected is AA or nucleotides
	if ($Nuc_Seq eq ""){$Nuc_Seq="no";}
    my $seq_length = length($aaSeq);
    my $ans="yes";

    #parameter1: the sequence identity is not too high
    if ($ident_percent >= $redundancyRate){ $ans = "identity percent $ident_percent is too big";}

    #parameter2: the sequence identity is higher than the minium idnentity percent that was defined for homologus
    if ($ident_percent < $min_id){ $ans = "identity percent $ident_percent is too low (below $min_id)";}

    #parameter3: the sequnece length is greater than the minimum sequence length
    elsif ($seq_length<$min_length){$ans = "the sequence length $seq_length is too short" ;}

    #parameter4: the sequnece letters should be legal to rate4site
	if ($Nuc_Seq eq "no") # AA seq
	{
		if ($aaSeq!~ m/^[ACDEFGHIKLMNPQRSTVWYBZX]+$/){$ans = "illegal character was found in sequence: $seqName";}
	}


    return $ans;
}
#----------------------------------
sub check_if_no_overlap{

# tests:
# 1. seq2 is inside seq1 or seq1 is inside seq2 : exit (since we assume, according to the order HSP are written,  that the one which was already chosen has a better e-val)
# 2. seq1 starts before seq2
# 3. seq2 starts before seq1
# -> calculate the overlapping fragment
# since seq1 has better e-val, we check if the overlapping fregment is more than 10% ($max_overlap) the length of seq 1
# yes: exit (return "no"). no: check that there is no overlap for all other fragments
#
#----------------------------------
#   GLOBAL INPUT
    my $max_overlap = shift;   # the maximum % of overlapping between 2 fragments of the same hit (should be given as decimal fraction)
    my $ref_seq_details = shift;  # reference to the sequence details array of hash which holds the info for that specific fragment

#   FRAGMENT (SUBJCT) IN QUESTION SPECIFIC INPUT
    my $s_bgn = shift;          # first amino_acid index
    my $s_end = shift;          # last amino acid sequence

    my ($fragment_beg, $fragment_end, $fragment_length, $overlap_length);
    my $ans = "check_if_no_overlap : no ans was picked";

    my $i=0;
    while ($i<=$#$ref_seq_details){ # read data from each array
        $fragment_beg = $ref_seq_details->[$i]{beg};
        $fragment_end = $ref_seq_details->[$i]{end};
        $fragment_length = $fragment_end-$fragment_beg+1;

        # fragment is inside subjct or subjct is inside fragment
        if ($s_bgn <= $fragment_beg && $s_end >= $fragment_end) {
            return "previous fragment found $fragment_beg"."_$fragment_end is fully inside new fragment";
        }
        elsif ($s_bgn >= $fragment_beg && $s_end <= $fragment_end){
            return "new fragment is fully inside previous fragment found $fragment_beg"."_$fragment_end";
        }

        # fragment begins before subjct
        elsif($fragment_end<$s_end && $fragment_end>$s_bgn){
            $overlap_length = $fragment_end - $s_bgn + 1;
            if ($overlap_length > $fragment_length*$max_overlap) {
                return "overlap length of fragment is ". $overlap_length." which is greater than maximum overlap: ".($fragment_length*$max_overlap);
            }
            # when the fragment might be a good match, we can only insert it if it did not match to all the fragments
            elsif($i==$#$ref_seq_details){
                $ans = "yes";
            }
        }
        # fragment begins after subjct
        elsif($fragment_beg>$s_bgn &&  $fragment_beg < $s_end){
            $overlap_length = $s_end - $fragment_beg + 1;
            if ($overlap_length > $fragment_length*$max_overlap) {
                return "overlap length of fragment is ". $overlap_length." which is greater than maximum overlap: ".($fragment_length*$max_overlap);
                }
            # when the fragment might be a good match, we can only insert it if it did not match to all the fragments
            elsif($i==$#$ref_seq_details){
                $ans = "yes";
            }
        }
        #no overlap
        elsif($fragment_beg>=$s_end || $fragment_end<=$s_bgn){
            if($i==$#$ref_seq_details){
                $ans = "yes";
            }
        }
        $i++;
    }
    return $ans;
}
1;

#----------------------------------
sub get_blast_round{
#   reads the blast file, look for the last round found.
#   also check if converged
#
# Params:
#   blast_output_filename - name of the blast file to process
#
# Returns:
#  If error:("err", error_description)
#  If no hits found : ("no_hits")
#  else: (last_round_number, 0 or 1 if found_converged)
#----------------------------------
    my $blast_output_filename = shift;
    my $last_round_number = 0;
    my $found_converged = 0;
    my $ret = "";
# search file for last round

    unless (open(BLAST, "$blast_output_filename")){
		return("err", "cannot open file '$blast_output_filename' : $!");
	}

  	while (<BLAST>){
        if (/\*+ No hits found/){
            return("no_hits");
        }
		if(/Results from round (\d+)/){
        	$last_round_number = $1;
        }
        if(/CONVERGED!/){
        	# found last round
        	$found_converged = 1;
        	last;
        }
    }
	close BLAST;
    return ($last_round_number, $found_converged);
}
#----------------------------------
sub print_blast_according_to_round{
# prints the content of the file from blast, starting from $round_number
# Returns:
#  If error : ("err", error_description)
#  Else:  ("no_err", 0 if round not found; 1 otherwise)
#----------------------------------
    my $blast_output_filename = shift;
    my $round_number = shift; # since blast can return less rounds than requested, the blast file will be searched for last round
    my $output_filename = shift;
    my $round_found = 0;

    # open files
    unless (open(BLAST, "$blast_output_filename")){
    	return("err", "Can't open file '$blast_output_filename': $!");
    }
    unless (open(NEW_FILE, ">$output_filename")){
    	return("err", "Can't open file '$output_filename': $!");
    }
    # read blast file
    my $output_data = 1;
    my $line;
    while($line = <BLAST>){
        if($line =~ /Results from round (\d+)/){
            if($round_number == $1){
                $output_data = 1;
                $round_found = 1;
            }
            else{
                $output_data = 0;
            }
        }
        if ($output_data == 1){
            print NEW_FILE $line;
        }
    }

    # close files
    close NEW_FILE;
    close BLAST;

    return("no_err",$round_found);

}
#----------------------------------
sub sort_sequences_from_eval{
# input: reference to hash which holds sequences and their evalue
# optional: query sequence input file (to extract the query sequence and add it)
# output: a file with the top-evalued sequences out of the hash
#----------------------------------
    my $ref_to_seqs_hash = shift;
    my $ref_to_cd_hash = shift;
    my $max_num_homologs = shift;
    my $output_file = shift;
    my $input_file = shift;
    my ($s_name, $s_aa_sq, $query_name, $counter);
    my $query_AAseq = "";

    # open original seq fasta file
    if (defined($input_file) and -e $input_file){
        unless (open QUERY, $input_file) {return ("err","Can't open '$input_file' for reading $!");}

        # get query name & amino acid seq from file
        while (<QUERY>)    {
            chomp;
            if (/^>(.+)/){
                $query_name = $1;
            }
            else{
                $query_AAseq.= $_;
            }
        }
        close QUERY;
    }

    # choose best homologs and create final file final_homolougs_filename
    $counter = 1;
    unless(open FINAL, ">>".$output_file) {return ("err","Can't open '$output_file' for writing $!");}

    # write query details
    print FINAL ">$query_name\n"."$query_AAseq\n" if ($query_AAseq ne "");

    my %blast_hash = %$ref_to_seqs_hash;
    my %cd_hit_hash = %$ref_to_cd_hash;
    # write homologs
    foreach $s_name (sort { $blast_hash{$a} <=> $blast_hash{$b} } keys %blast_hash ){
        # quit if reached max number of homologs
        if ($counter > $max_num_homologs){
            last;
        }

        # write next homolog
        #if (defined($cd_hit_hash{$s_name})){ #HAIM
	if (defined($cd_hit_hash{$s_name}{SEQ})){
            #$s_aa_sq = $cd_hit_hash{$s_name};
	    $s_aa_sq = $cd_hit_hash{$s_name}{SEQ};
	    my $s_description = $cd_hit_hash{$s_name}{DESCRIPTION};
            print FINAL ">$s_name $s_description\n".$s_aa_sq."\n";
            $counter++;
        }
    }
    close FINAL;
    return ("ok");
}

#----------------------------------
#----------------------------------
sub sort_sequences_from_eval_without_cd_hit{
# input: reference to hash which holds sequences and their evalue
# optional: query sequence input file (to extract the query sequence and add it)
# output: a file with the top-evalued sequences out of the hash
#----------------------------------
	my $blast_seqs_files=shift;
	my $ref_to_seqs_hash = shift;
    my $max_num_homologs = shift;
    my $output_file = shift;
    my $input_file = shift;
    my ($s_name, $s_aa_sq, $query_name, $counter);
    my $query_AAseq = "";

    # open original seq fasta file
    if (defined($input_file) and -e $input_file){
        unless (open QUERY, $input_file) {return ("err","Can't open '$input_file' for reading $!");}

        # get query name & amino acid seq from file
        while (<QUERY>)    {
            chomp;
            if (/^>(.+)/){
                $query_name = $1;
            }
            else{
                $query_AAseq.= $_;
            }
        }
        close QUERY;
    }

    # choose best homologs and create final file final_homolougs_filename
    $counter = 1;
    unless(open FINAL, ">>".$output_file) {return ("err","Can't open '$output_file' for writing $!");}

    # write query details
    print FINAL ">$query_name\n"."$query_AAseq\n" if ($query_AAseq ne "");

	my %seqs_hash=();

	unless (open (FASTA_SEQS,$blast_seqs_files)) {return ("err","Can't open '$blast_seqs_files' for reading $!");}
	my $seq="";
	my $seq_id="";
	while (my $line=<FASTA_SEQS>)
	{
		chomp ($line);
#		print "LINE:$line\n";
		if ($line=~/^>([A-Za-z0-9._]+)\s*\|(.*)/)
		{
#			print "HEADER:$1\n";<STDIN>;
			my $seq_id=$1;
			if ($seq_id !~/[0-9]+/)
			{
				if ($line=~/^>([A-Za-z0-9._]+)\s*\|\s*([A-Za-z0-9._]+)\s*\|([A-Za-z0-9._]+)\s*|/)
				{
					$seq_id=$1."|".$2."|".$3;
				}
				else
				{
					$seq_id=$2;
				}
			}
#			print "HEADER:$1\n";<STDIN>;
			my $seq=<FASTA_SEQS>;
#			print "SEQ:$seq\n";<STDIN>;
			$seqs_hash{$seq_id}{'sequence'}=$seq;
			$seqs_hash{$seq_id}{'desc'}=$line;
#			print "$seq_id\n$seq\n";<STDIN>;
		}
	}
	close (FASTA_SEQS);
#	$in  = Bio::SeqIO->new(-file => "$blast_seqs_files",
#						   -format => 'Fasta');

##	while ( my $seq = $in->next_seq() )
##	{
##		my $seq_id=join("",$seq->id());
##		my $sequence=join("",$seq->seq());
##		my $seq_desc=join("",$seq->desc());
##		$seqs_hash{$seq_id}{'sequence'}=$sequence;
##		$seqs_hash{$seq_id}{'desc'}=$seq_desc;
##		print "$seq_id\t$seq_desc\n$seq_desc\n";
##	}

    my %blast_hash = %$ref_to_seqs_hash;
#    my %cd_hit_hash = %$ref_to_cd_hash;
    # write homologs
    foreach $s_name (sort { $blast_hash{$a} <=> $blast_hash{$b} } keys %blast_hash ){
        # quit if reached max number of homologs
        if ($counter > $max_num_homologs){
            last;
        }

        # write next homolog
        #if (defined($cd_hit_hash{$s_name})){ #HAIM
##	if (defined($cd_hit_hash{$s_name}{SEQ})){
            #$s_aa_sq = $cd_hit_hash{$s_name};
##	    $s_aa_sq = $cd_hit_hash{$s_name}{SEQ};
##	    my $s_description = $cd_hit_hash{$s_name}{DESCRIPTION};

		my $s_description=$seqs_hash{$s_name}{'desc'};
		my $s_sq=$seqs_hash{$s_name}{'sequence'};
		print FINAL "$s_description|$s_name\n$s_sq";
		print "$s_description\n$s_sq"."\n";
		$counter++;
#        }
    }
    close FINAL;
    return ("ok",$counter);
}
#-------------------------------------------------------------
sub choose_homologoues_from_blast_according_user_list{
#-------------------------------------------------------------
    my $working_dir = shift;
    my $query = shift;  # query sequence, given in Fasta Format

    my $selcted_seq_id = shift;  #a full path of file containing the id's of selected sequences

    my $blast_output = shift;  # output of blast run, to read from, full path
    my $fasta_output = shift;  # the final homolouges file, to write to

    my $ref_blast_hash = shift; # a hash to hold all the sequences names and their e-values
	my $Nuc_Seq=shift; #optional, yes if sequence is nucleotides sequence;
	if ($Nuc_Seq eq ""){$Nuc_Seq="no";}
    my ($query_seq_name, $query_seq_length, $query_AAseq, $min_length, $seq_details, $s_name, $AAseq, $s_eval, $s_beg, $s_end, $s_ident,$s_description,$seq_description);
    my $seq_name_exists = "no";
    my $ans = "no";
    my %sequences = ();
    my %User_Selected_ID=();
    my $Hit_Num=0;
    # hash of sequences names. each sequence name (unique) points to array of hashes.
    # {seq_name} => [{e_val => <e_val1>, AAseq => <AAseq1>, beg => <S1_beg>, end => <S1_end>},...,{e_val => <e_valn>, AAseq => <AAseqn>, beg => <Sn_beg>, end => <Sn_end>}]
    # seq_name : the name,
    # S1_beg : the index for the begining of the match for S1,
    # S1_end : the index to the end of the match for S1
    # e_val1 : the e-value.
    # AAseq1 : the sequence itself, as read from the "Subjct" line of the HSP


    ##########################################
    #  Reading User Selected Seq id to hash  #
    ##########################################
    unless (open USER_SELECTED_IDs,$selcted_seq_id){
    	return ("sys", "parseFiles::choose_homologoues_from_blast_according_user_list : can't open file $selcted_seq_id for reading\n");
    }
    while (my $line=<USER_SELECTED_IDs>)
    {
    	chomp ($line);
		$User_Selected_ID{$line}=1 if ($line ne "");
		print "USER:*$line*\t";
    }
    ################################
	# Extracting Query information #
    ################################
    unless (open QUERY, $query){
       	return ("sys", "parseFiles::choose_homologoues_from_blast_according_user_list : can't open file $query for reading\n");
    }

    while (<QUERY>){
    	chomp ($_);
        if ($_=~/^>(.+)/){
                $query_seq_name = $1;
            }
	elsif ($_ !~ m/>/){
            $_=~m/(\S+)/;
            $query_AAseq.= $_;
	    $query_seq_length += length($1);
        }
    }
	print "NUC:$Nuc_Seq\n";
    close QUERY;
    my $searchio = new Bio::SearchIO(-format => 'blast',
                                     -file   => $blast_output);
    while( my $result = $searchio->next_result ) {  # $result is a Bio::Search::Result::ResultI object
        while( my $hit = $result->next_hit ) { # $hit is a Bio::Search::Hit::HitI object or undef if there are no more
            $Hit_Num++;
	    my $s_name_acc; # The accession id of the seq
	    $s_description=$hit->description();
	    $s_name= $hit->name();
            if ($s_name =~ m/^.+\|([A-Za-z0-9]+)\|([A-Za-z0-9]+_[A-Za-z0-9]+)/) {$s_name = $1.'|'.$2;$s_name_acc=$2;}
            if ($s_name =~ m/^.+\|([A-Za-z0-9]+_[A-Za-z0-9]+)\|([A-Za-z0-9]+)/) {$s_name = $2.'|'.$1;$s_name_acc=$1;}
            elsif ($s_name =~ m/([A-Za-z0-9]+)(\|[A-Za-z0-9]_[A-Za-z0-9]+)/) {$s_name = $1.$2;$s_name_acc=$2;}
            elsif ($s_name =~ m/([A-Za-z0-9]+_[A-Za-z0-9]+)\|([A-Za-z0-9]+)/) {$s_name = $2.'|'.$1;$s_name_acc=$1;}
			elsif ($s_name =~ m/^.+\|([A-Za-z0-9]+_[A-Za-z0-9.]+)/) {$s_name = $1;$s_name_acc=$1;}
			elsif ($s_name =~ m/([A-Za-z0-9]+_[A-Za-z0-9]+)/) {$s_name = $1;$s_name_acc=$1;}

#			elsif ($s_name =~ m/(.*)\|([A-Za-z0-9._]+)\|/) {$s_name = $2;$s_name_acc=$2;}
			elsif ($s_name =~ m/(.*)\|([A-Za-z0-9._]+)\|(.*)?(\|)?(.*)?/){$s_name = $2;$s_name_acc=$2; if ($1 eq "pdb"){$s_name = $2.$3;$s_name_acc=$2.$3;}}
			$s_name_acc.=".BlastHit_".$Hit_Num; # Each Seq Id can appear more than once with different overlaps
			print "$s_name\t$s_name_acc\t";
			$seq_name_exists = "no";
            while( my $hsp = $hit->next_hsp ) { #hsp is the next available High Scoring Pair, Bio::Search::HSP::HSPI object or null if finished
                # extracting relevant details from the fragment
                ($s_beg, $s_end) = $hsp->range("sbjct");
                $AAseq = $hsp->hit_string();
                $AAseq =~ s/-//g;
                $s_eval = $hsp->evalue();
                $s_eval =~ s/,//g;
                if ($s_eval =~ m/^e/) {$s_eval = "1".$s_eval;}
                $s_ident = $hsp->percent_identity();
				print "$s_name_acc\n";
                if (exists  $User_Selected_ID{$s_name_acc})
				{
#HERE
					# deciding if we take the fragment
                	# in case there is already a fragemnt with the same name, we do another validity test for overlapping
					my $ans ="yes";
					if (exists $sequences{$s_name}){
                    	$seq_name_exists = "yes";
                    	$ans = &check_if_seq_AA_valid($AAseq, $s_name) if ($Nuc_Seq eq "no");
					}
                	else {
						$ans = &check_if_seq_AA_valid($AAseq, $s_name) if ($Nuc_Seq eq "no");
					}
                	# after taking the info, check if the currecnt sequence is valid. If so - insert it to the hash
                	if ($ans eq "yes"){
                    	# in case there is more than one fragment for this seq_name: add another hash to the details array
                    	if ($seq_name_exists eq "yes"){
#                       	 push @$seq_details, {e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end}; #HAIM
							push @$seq_details, {e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end, description => $s_description};
                    	}
                    	# in case it is the first fragment for this seq_name: insert a details array as a value for this seq_name key
                    	else{
#                       	 $sequences{$s_name} = [{e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end}]; #HAIM
							$sequences{$s_name} = [{e_val => $s_eval, AAseq => $AAseq, beg => $s_beg, end => $s_end, description => $s_description}];
                    	}
                	}
                	else {print "Fragment $s_name"."_$s_beg"."_$s_end rejected: $ans\n";}
				}
            }
        }
    }
    $searchio->close();
#    close OUT_REJECT;

    ##############################################################################
    # Print the selected homolougs to a file and insert the e-value info to hash #
    ##############################################################################

    my $i;
    my $seq_frag_name;

     unless (open OUT ,">>".$fasta_output)
        {return ("sys", "parseFiles::choose_homologoues_from_blast_according_user_list: can't open file $fasta_output for writing $!\n");}
    ## write query details - Done Before
    #print OUT ">$query_seq_name\n"."$query_AAseq\n" if ($query_AAseq ne "");
    while (($s_name, $seq_details) = each (%sequences)){
        for ($i=0; $i<=$#$seq_details; $i++){
            $seq_frag_name = "$s_name"."_".$seq_details->[$i]{beg}."_".$seq_details->[$i]{end};
			$seq_description= $seq_details->[$i]{description};
            #print OUT ">$seq_frag_name\n"; #HAIM
            print OUT ">$seq_frag_name | $seq_description\n";
			print OUT $seq_details->[$i]{AAseq}."\n";
            $ref_blast_hash->{$seq_frag_name} = $seq_details->[$i]{e_val};
        }
    }
    close OUT;

    #####################################################
    # Check that the number of homolougs found is legal #
    #####################################################

    my $final_num_homolougs = (keys %$ref_blast_hash);
    my $ret = "";
    if ($final_num_homolougs == 1){
        $ret .= "only <a href=$fasta_output>one unique sequence</a> ";
    }
    else{
        $ret = "ok";
    }
    if ($ret !~ /^ok/){
        my $blast_link;
        $blast_link = $1 if ($blast_output =~ /$working_dir(.+)/);
        $ret .= "were chosen from <a href=$blast_link>PSI-BLAST output</a>.";
        return ("user", $ret);
    }
    return ($ret,$final_num_homolougs);
}
#----------------------------------
# Strip HTML tags - Not perfect - not working for multilines tag
sub strip_html{
	my $HTML_File=shift;
	my $PlainText_File=shift;
	unless (open HTML,$HTML_File) {return "parseFiles::strip_html: Can not Open the HTML File: $HTML_File $!";}
	unless (open PLAIN_TEXT,">$PlainText_File") {return "parseFiles::strip_html: Can not open $PlainText_File for writing $!";}
	while (my $line=<HTML>)
	{
		$line=~s/<(?:[^>'"]*|(['"]).*?\1)*>//gs;
		print PLAIN_TEXT $line;
	}
	close (HTML);
	close (PLAIN_TEXT);
	return "ok";
}
#----------------------------------
#----------------------------------