#!/usr/bin/perl -w

#--------------------------------------------------------------------
-------
#
# raw2pcel
#
# Reads the RAW PM and MM values and calculates the average 
background
# and standard deviation of the background.  It does this by first 
estimating
# these values from the noise histrogram (where noise is defined as 
those
# MM values which satisfy PM - MM < epsilon).  It first estimates the 
average
# background as the peak of the histogram and the standard deviation 
from the
# left tail standard deviation.  These values then serve as initial 
estimates
# for a fit of the left tail histogram to a gaussian.  The average 
background
# is then subtracted, and the cells are flagged in the output PCELL 
file:
# S  - Saturated
# PS - Partially Saturated
# G  - Good
# N  - Noise
# A report file, .REP, is also generated with the fitted background 
values.

use Getopt::Long;

&GetOptions("i=s"    => \$raw_file,
	    "o=s"    => \$pcel_file,
	    "e=i"    => \$epsilon);

($raw_file && $pcel_file && $epsilon) || die "Usage: raw2pcel OPTIONS

where OPTIONS are:
                  -i   RAW DATA EXTRACTED FROM CDF AND CELL FILES 
(.RAW extension)
                  -o   PROCESSED CEL FILE (.PCEL extension)
		  -e   epsilon for the background PM-MM < epsilon
\n";

if (!($pcel_file =~ /PCEL/)) {
    print STDERR "Processed CEL file must have .PCEL extension!\n";
    print STDERR "Program terminating.\n";
    die;
}

use Math::Libm 'atan';
$pi = 4.0*atan(1.0);
$sq2 = sqrt(2.0);

#parameters for extraction of background mean and standard
#deviation from cell PM and MM values.

$saturation_default = 45000; #default saturation value;
#$epsilon = 50; #PM-MM < $epsilon defines "noisey" MM values.
$resolution = 1; #resolution of the noise histogram.
$sigma_fac = 0.2; #fitting MM <  + $sigma_fac * $sigma to gaussian.
$min_sd_zero = 20; #minimum number of cells with std dev = 0 to set 
max value to $saturation.
$saturation = &read_raw($raw_file); #read PM MM values.  Load 
coefficient of variation array. 
                                    #extract saturation value or use 
$saturation_default.

$median_cv = &mean_median(\@cv_array);
@cv_array = ();

#$hist_file = $pcel_file;
#$hist_file =~ s/PCEL/BKG/;
#open(BKG, ">$hist_file") || die "Can't write $hist_file.\n";
#print BKG join("\n", @MM_noise); print BKG "\n";
#close(BKG);
 
@cv_array = ();
&calc_hist(\@MM_noise); #calculate MM noise histogram.
$bckgrd_mean_0 = &calc_max_hist(); #find peak of MM noise historgram.
($bckgrd_std_0,$N_cells) = &calc_left_sigma($bckgrd_mean_0); #calc 
left tail standard dev of MM histogram.
$N_0 = $N_cells * $resolution; #calculate estimate of gaussian 
prefactor.
($bckgrd_mean_1,$bckgrd_std_1) = 
&fit_gaussian($raw_file,$bckgrd_mean_0,$bckgrd_std_0,$N_0,$sigma_fac)
; 
#fit gaussian and find new estimates of the background, standard 
deviation and gaussian prefactor for the noisey MM.
@MM_noise = ();
#cutoff values used for flagging cells.  Note these must be 
background subtracted.

$recalc_noise_cutoff = 3.0 * $bckgrd_std_1;
$noise_cutoff = 2.0 * $bckgrd_std_1; #noise cutoff for noise flag and 
recalculation of noisy values using error func.
$saturation_cutoff = $saturation - $bckgrd_mean_1; #totally saturated 
cells.
$partial_saturation_cutoff = $saturation_cutoff/(1.0 + $median_cv); 
#paritally saturated cells.

&write_output($raw_file,$pcel_file);

sub write_output {
    my($raw_file,$pcel_file) = @_;
    my($x,$y);
    $rep_file = $pcel_file;
    $rep_file =~ s/PCEL/REP/;
    print STDERR "$rep_file\n";
    open(OUT, ">$pcel_file") || die "Can't write $pcel_file.\n";
    open(IN, $raw_file) || die "Can't open $raw_file\n";
    $_ = ;
    chomp;
    $_ = ;
    chomp;
    $count_saturated = 0;
    $count_partial_saturated = 0;
    $count_noise = 0;
    $total = 0;
    $count_crosshyb = 0;
    $count_good = 0;
    $last_id = "id_not";
    while () {
	chomp;
	@input = split(/\t/, $_);
	$x = $input[2];
	$y = $input[3];
	$probe_id = $input[0];
	if ($input[1] =~ /PM/) {
	    $PM = $input[4];
	    $PM_raw = $PM;
	    $PM_id = $probe_id;
            $PM_x = $x;
	    $PM_y = $y;
	    $PM -= $bckgrd_mean_1;
	} else {
	    $MM_id = $probe_id;
	    $MM_x = $x;
	    $MM_y = $y;
	    if (($MM_x == $PM_x) && ($MM_y = $PM_y + 1) && ($PM_id eq 
$MM_id)) {
		$MM = $input[4];
		$PMMM = $PM_raw-$MM;
		$MM -= $bckgrd_mean_1;
		$total++;
		if ($PM >= $saturation_cutoff) {
		    $count_saturated++;
		    $PM_flag = "S";
		} elsif (($PM < $saturation_cutoff) && ($PM >= 
$partial_saturation_cutoff)) {
		    $count_partial_saturated++;
		    $PM_flag = "PS";
		} elsif ($PM <= $noise_cutoff) {
		    $count_noise++;
		    $PM_flag = "N";
		} else {
		    if (($PM - $MM) < $noise_cutoff) {
			$count_crosshyb++;
			$PM_flag = "CH";
		    } else {
			$count_good++;
			$PM_flag = "G";
		    }
		}
		if ($PM <= $recalc_noise_cutoff) {
		    $PM = &recalc_noise($PM,$bckgrd_std_1);
		}
		if ($MM <= $recalc_noise_cutoff) {
		    $MM = &recalc_noise($MM,$bckgrd_std_1);
		}
		if (($probe_id eq $last_id)) {
		    push(@PM_array,$PM);
		    push(@FLAG_array,$PM_flag);
	            push(@MM_array,$MM);
		    push(@PMMM_array,$PMMM);
		    $last_id = $probe_id;
		} else {
		    if ($last_id ne "id_not") {
			$num_PM = $#PM_array + 1;
			$PM_out = join("\t", @PM_array);
			print OUT "$last_id\t$num_PM\n";
			print OUT "$PM_out\n";
			$MM_out = join("\t",@MM_array);
			print OUT "$MM_out\n";
			$PMMM_out = join("\t",@PMMM_array);
			print OUT "$PMMM_out\n";
			$FLAG_out = join("\t", @FLAG_array);
			print OUT "$FLAG_out\n";
		    }
		    @PM_array = ();
		    @FLAG_array = ();
		    push(@PM_array,$PM);
		    push(@FLAG_array,$PM_flag);
		    @MM_array = ();
		    push(@MM_array,$MM);
		    @PMMM_array = ();
		    push(@PMMM_array,$PMMM);
		    $last_id = $probe_id;
		}		    
	    } else {
		print STDERR "Warning: $pcel_file not properly 
sorted!\n";
                print STDERR "Aborting execution.\n";
		die;
	    }
	}
    } #print last line of PCEL file!
    $num_PM = $#PM_array + 1;
    $PM_out = join("\t", @PM_array);
    print OUT "$last_id\t$num_PM\n";
    print OUT "$PM_out\n";
    $MM_out = join("\t",@MM_array);
    print OUT "$MM_out\n";
    $PMMM_out = join("\t",@PMMM_array);
    print OUT "$PMMM_out\n";
    $FLAG_out = join("\t", @FLAG_array);
    print OUT "$FLAG_out\n";
    
    close(IN);
    close(OUT);
    
    $prcnt_sat = &calc_percent($count_saturated,$total);
    $prcnt_psat = &calc_percent($count_partial_saturated,$total);
    $prcnt_noise = &calc_percent($count_noise,$total);
    $prcnt_crosshyb = &calc_percent($count_crosshyb,$total);
    $prcnt_good = &calc_percent($count_good,$total);
    $max_cell -= $bckgrd_mean_1;
    $min_cell -= $bckgrd_mean_1;
    $min_cell = &recalc_noise($min_cell,$bckgrd_std_1);
    open(REP, ">$rep_file") || die "Can't write $rep_file.\n";
    print REP "PCEL_File=$pcel_file\n";
    print REP "Epsilon=$epsilon\n";
    print REP "Background=$bckgrd_mean_1\n";
    print REP "Std_Dev_Background=$bckgrd_std_1\n";
    print REP "Median_Coeff_Variation=$median_cv\n";
    print REP "Percent_Saturated=$prcnt_sat\n";
    print REP "Percent_Partial_Saturated=$prcnt_psat\n";
    print REP "Percent_Noise=$prcnt_noise\n";
    print REP "Percent_CrossHyb=$prcnt_crosshyb\n";
    print REP "Percent_Good=$prcnt_good\n";
    print REP "Saturation_Cutoff=$saturation_cutoff\n";
    print REP 
"Partial_Saturation_Cutoff=$partial_saturation_cutoff\n";
    print REP "Noise_Cutoff=$noise_cutoff\n";
    print REP "Max_PM=$max_cell\n";
    print REP "Min_PM=$min_cell\n";
    close(REP);
}

sub mean_median {
    my $arrayref = shift;
    my @array = sort {$a <=> $b} @$arrayref;
    if (@array % 2) {
        return $array[@array/2];
    } else {
        return ($array[@array/2-1] + $array[@array/2]) / 2;
    }
}

sub calc_percent {
    my($value,$total) = @_;
    $value /= $total;
    return $value * 100.0;
}
    
sub recalc_noise {
    my($x,$s) = @_;
    use Math::Libm 'erf';
    $a = exp( -$x*$x / (2.0*$s*$s) );
    $b = ( 1.0 + erf( $x / ( $s * $sq2 ) ) ) / 2;
    $c = $x * $b + ( $s / sqrt(2.0 * $pi) ) * $a;
    return $c;
}

sub fit_gaussian {
    my($file,$bckgrd_0,$sigma_0,$N,$sig_fac) = @_;
    my($ave,$upper_MM,$bckgrd_1,$sigma_1,$N_1);
    $dat_file = $file . "_noise.dat";
    open(DAT, ">${dat_file}") || die "Can't write $dat_file.\n";
    $upper_MM = $bckgrd_0 + $sig_fac * $sigma_0;
    foreach $ave (keys %{$hist}) {
	if ($ave <= $upper_MM) {
	    print DAT "$ave\t$hist->{$ave}\n";
	}
    }
    close(DAT);	    
    $tmp_in = $file . ".gauss.in";
    $tmp_out = $file . ".gauss.out";
    $ps_plot = $file . ".gauss.ps";
    open(TMP, ">/tmp/${tmp_in}") || die "can't write ${tmp_in}\n";
    print TMP "x <- matrix(scan(\"$dat_file\"), ncol = 2, byrow = 
T)\n";
    print TMP "fn <- function(p) sum((x[,2] - 
(2*p[3]/(sqrt(2*pi)*p[2]))*exp(-(x[,1]-p[1])^2/(2*p[2]^2)))^2)\n";
    print TMP "out <- nlm(fn, p = c($bckgrd_0, $sigma_0, $N))\n";
    print TMP "p <- out\$estimate\n";
    print TMP "write.table(p, file = \"/tmp/$tmp_out\", quote=F, 
row.names=F, col.names=F)\n";
    print TMP "plot(x[,1],x[,2])\n";
    print TMP "xfit <- seq($MM_sorted[0], $upper_MM, .1)\n";
    print TMP "yfit <- (2*p[3]/(sqrt(2*pi)*p[2]))*exp(-(xfit-
p[1])^2/(2*p[2]^2))\n";
    print TMP "lines(spline(xfit, yfit))\n";
    print TMP "q()\n";
    print TMP "n\n";
    close(TMP);
    system("R --vsize=20M --no-save --slave < /tmp/$tmp_in\n");	
    system("mv Rplots.ps $ps_plot\n");
    system("rm /tmp/$tmp_in\n");
    open(R_OUT, "/tmp/$tmp_out") || die "can't read ${tmp_out}\n";
    $_ = ;
    chomp;
    $bckgrd_1 = $_;
    $_ = ;
    chomp;
    $sigma_1 = $_;
    $_ = ;
    chomp;
    $N_1 = $_;
    close(R_OUT);
    system("rm /tmp/$tmp_out\n");
    return($bckgrd_1,$sigma_1);
}

sub calc_max_hist {
    my($count_hist,$max,$bckgrd);
    $max = 0;
    $count_hist = 0;
    foreach $ave (keys %{$hist}) {
	$count_hist += $hist->{$ave};
	if ($hist->{$ave} > $max && $ave < 10000) {
	    $max = $hist->{$ave};
	    $bckgrd = $ave;
	}
    }
    print STDERR "count hist: $count_hist\n";
    return $bckgrd;
}

sub calc_hist {
    my($arrayref) = shift;
    my($MM,$count,$ave);
    @MM_sorted = sort { $a <=> $b } @$arrayref;
    $last_MM = $#MM_sorted;
    $tot_MM = $last_MM + 1;
    $min_MM = $MM_sorted[0];
    $max_MM = $MM_sorted[$last_MM];
    print STDERR "total MM = $tot_MM\n";
    $num_bins = int(($max_MM - $min_MM)/$resolution)+1;
    $first_MM = 0;
    for ($j=0; $j<=$num_bins; $j++) {
	$min = $min_MM + $j * $resolution;
	$max = $min + $resolution;
	$ave = ($min + $max)/2.0;
	$in_bin = 0;
	for ($i=$first_MM; $i<=$last_MM; $i++) {
	    if (($MM_sorted[$i] >= $min) && ($MM_sorted[$i] < $max) ) 
{
		$in_bin++;		
	    } else {
		$first_MM = $i;
		last;
	    }
	}
	$hist->{$ave} = $in_bin;
    }
}

sub read_raw {
    my($file) = $_[0];
    my($sat,$probe_id);
    @cv_array = ();
    open(FILE, $file) || die "cannot open $file\n";
    $_ = ;
    chomp;
    @input = split(/\t/, $_);
    $min_cell = $input[1];
    $max_cell = $input[2];
    $_ = ;
    chomp;
    @input = split(/\t/, $_);
    $num_sd_zero = $input[1];
    if (($num_sd_zero >= $min_sd_zero) || ($max_cell > 
$saturation_default)) {
	$sat = $max_cell;
    } else {
	$sat = $saturation_default;
    }
    while () {
	chomp;
	@input = split(/\t/, $_);
	$x = $input[2];
	$y = $input[3];
	$probe_id = $input[0];
	if ($input[1] =~ /PM/) {
	    $PM = $input[4];
            $PM_x = $x;
	    $PM_y = $y;
	    $SD = $input[5];
	    $PM_id = $probe_id;
	    push(@cv_array,$SD/$PM);
	} else {
	    $MM_x = $x;
	    $MM_y = $y;
	    $MM_id = $probe_id;
	    if (($MM_x == $PM_x) && ($MM_y = $PM_y + 1) && ($PM_id eq 
$MM_id)) {
		$MM = $input[4];
		$DIFF = $PM - $MM;
		if (abs($DIFF) < $epsilon) {
		    push(@MM_noise,$MM);
		}
	    } else {
		print STDERR "Warning: $file not properly sorted!\n";
                print STDERR "Aborting execution.\n";
		die;
	    }
	}
    }
    close(FILE);
    return $sat;
}
    
sub calc_left_sigma {
    my($bckgrd) = $_[0];
    my($MM,$var,$count);
    $var = 0;
    $count = 0;
    foreach $MM (@MM_sorted) {
	if ($MM < $bckgrd) {
	    $count++;
	    $var += ($MM - $bckgrd) ** 2;
	}
    }
    $var /= ($count - 1);
    return (sqrt($var),$count);
}