#!/usr/bin/perl -w #-------------------------------------------------------------------- ------- # # raw2pcel # # Reads the RAW PM and MM values and calculates the average background # and standard deviation of the background. It does this by first estimating # these values from the noise histrogram (where noise is defined as those # MM values which satisfy PM - MM < epsilon). It first estimates the average # background as the peak of the histogram and the standard deviation from the # left tail standard deviation. These values then serve as initial estimates # for a fit of the left tail histogram to a gaussian. The average background # is then subtracted, and the cells are flagged in the output PCELL file: # S - Saturated # PS - Partially Saturated # G - Good # N - Noise # A report file, .REP, is also generated with the fitted background values. use Getopt::Long; &GetOptions("i=s" => \$raw_file, "o=s" => \$pcel_file, "e=i" => \$epsilon); ($raw_file && $pcel_file && $epsilon) || die "Usage: raw2pcel OPTIONS where OPTIONS are: -i RAW DATA EXTRACTED FROM CDF AND CELL FILES (.RAW extension) -o PROCESSED CEL FILE (.PCEL extension) -e epsilon for the background PM-MM < epsilon \n"; if (!($pcel_file =~ /PCEL/)) { print STDERR "Processed CEL file must have .PCEL extension!\n"; print STDERR "Program terminating.\n"; die; } use Math::Libm 'atan'; $pi = 4.0*atan(1.0); $sq2 = sqrt(2.0); #parameters for extraction of background mean and standard #deviation from cell PM and MM values. $saturation_default = 45000; #default saturation value; #$epsilon = 50; #PM-MM < $epsilon defines "noisey" MM values. $resolution = 1; #resolution of the noise histogram. $sigma_fac = 0.2; #fitting MM < + $sigma_fac * $sigma to gaussian. $min_sd_zero = 20; #minimum number of cells with std dev = 0 to set max value to $saturation. $saturation = &read_raw($raw_file); #read PM MM values. Load coefficient of variation array. #extract saturation value or use $saturation_default. $median_cv = &mean_median(\@cv_array); @cv_array = (); #$hist_file = $pcel_file; #$hist_file =~ s/PCEL/BKG/; #open(BKG, ">$hist_file") || die "Can't write $hist_file.\n"; #print BKG join("\n", @MM_noise); print BKG "\n"; #close(BKG); @cv_array = (); &calc_hist(\@MM_noise); #calculate MM noise histogram. $bckgrd_mean_0 = &calc_max_hist(); #find peak of MM noise historgram. ($bckgrd_std_0,$N_cells) = &calc_left_sigma($bckgrd_mean_0); #calc left tail standard dev of MM histogram. $N_0 = $N_cells * $resolution; #calculate estimate of gaussian prefactor. ($bckgrd_mean_1,$bckgrd_std_1) = &fit_gaussian($raw_file,$bckgrd_mean_0,$bckgrd_std_0,$N_0,$sigma_fac) ; #fit gaussian and find new estimates of the background, standard deviation and gaussian prefactor for the noisey MM. @MM_noise = (); #cutoff values used for flagging cells. Note these must be background subtracted. $recalc_noise_cutoff = 3.0 * $bckgrd_std_1; $noise_cutoff = 2.0 * $bckgrd_std_1; #noise cutoff for noise flag and recalculation of noisy values using error func. $saturation_cutoff = $saturation - $bckgrd_mean_1; #totally saturated cells. $partial_saturation_cutoff = $saturation_cutoff/(1.0 + $median_cv); #paritally saturated cells. &write_output($raw_file,$pcel_file); sub write_output { my($raw_file,$pcel_file) = @_; my($x,$y); $rep_file = $pcel_file; $rep_file =~ s/PCEL/REP/; print STDERR "$rep_file\n"; open(OUT, ">$pcel_file") || die "Can't write $pcel_file.\n"; open(IN, $raw_file) || die "Can't open $raw_file\n"; $_ = ; chomp; $_ = ; chomp; $count_saturated = 0; $count_partial_saturated = 0; $count_noise = 0; $total = 0; $count_crosshyb = 0; $count_good = 0; $last_id = "id_not"; while () { chomp; @input = split(/\t/, $_); $x = $input[2]; $y = $input[3]; $probe_id = $input[0]; if ($input[1] =~ /PM/) { $PM = $input[4]; $PM_raw = $PM; $PM_id = $probe_id; $PM_x = $x; $PM_y = $y; $PM -= $bckgrd_mean_1; } else { $MM_id = $probe_id; $MM_x = $x; $MM_y = $y; if (($MM_x == $PM_x) && ($MM_y = $PM_y + 1) && ($PM_id eq $MM_id)) { $MM = $input[4]; $PMMM = $PM_raw-$MM; $MM -= $bckgrd_mean_1; $total++; if ($PM >= $saturation_cutoff) { $count_saturated++; $PM_flag = "S"; } elsif (($PM < $saturation_cutoff) && ($PM >= $partial_saturation_cutoff)) { $count_partial_saturated++; $PM_flag = "PS"; } elsif ($PM <= $noise_cutoff) { $count_noise++; $PM_flag = "N"; } else { if (($PM - $MM) < $noise_cutoff) { $count_crosshyb++; $PM_flag = "CH"; } else { $count_good++; $PM_flag = "G"; } } if ($PM <= $recalc_noise_cutoff) { $PM = &recalc_noise($PM,$bckgrd_std_1); } if ($MM <= $recalc_noise_cutoff) { $MM = &recalc_noise($MM,$bckgrd_std_1); } if (($probe_id eq $last_id)) { push(@PM_array,$PM); push(@FLAG_array,$PM_flag); push(@MM_array,$MM); push(@PMMM_array,$PMMM); $last_id = $probe_id; } else { if ($last_id ne "id_not") { $num_PM = $#PM_array + 1; $PM_out = join("\t", @PM_array); print OUT "$last_id\t$num_PM\n"; print OUT "$PM_out\n"; $MM_out = join("\t",@MM_array); print OUT "$MM_out\n"; $PMMM_out = join("\t",@PMMM_array); print OUT "$PMMM_out\n"; $FLAG_out = join("\t", @FLAG_array); print OUT "$FLAG_out\n"; } @PM_array = (); @FLAG_array = (); push(@PM_array,$PM); push(@FLAG_array,$PM_flag); @MM_array = (); push(@MM_array,$MM); @PMMM_array = (); push(@PMMM_array,$PMMM); $last_id = $probe_id; } } else { print STDERR "Warning: $pcel_file not properly sorted!\n"; print STDERR "Aborting execution.\n"; die; } } } #print last line of PCEL file! $num_PM = $#PM_array + 1; $PM_out = join("\t", @PM_array); print OUT "$last_id\t$num_PM\n"; print OUT "$PM_out\n"; $MM_out = join("\t",@MM_array); print OUT "$MM_out\n"; $PMMM_out = join("\t",@PMMM_array); print OUT "$PMMM_out\n"; $FLAG_out = join("\t", @FLAG_array); print OUT "$FLAG_out\n"; close(IN); close(OUT); $prcnt_sat = &calc_percent($count_saturated,$total); $prcnt_psat = &calc_percent($count_partial_saturated,$total); $prcnt_noise = &calc_percent($count_noise,$total); $prcnt_crosshyb = &calc_percent($count_crosshyb,$total); $prcnt_good = &calc_percent($count_good,$total); $max_cell -= $bckgrd_mean_1; $min_cell -= $bckgrd_mean_1; $min_cell = &recalc_noise($min_cell,$bckgrd_std_1); open(REP, ">$rep_file") || die "Can't write $rep_file.\n"; print REP "PCEL_File=$pcel_file\n"; print REP "Epsilon=$epsilon\n"; print REP "Background=$bckgrd_mean_1\n"; print REP "Std_Dev_Background=$bckgrd_std_1\n"; print REP "Median_Coeff_Variation=$median_cv\n"; print REP "Percent_Saturated=$prcnt_sat\n"; print REP "Percent_Partial_Saturated=$prcnt_psat\n"; print REP "Percent_Noise=$prcnt_noise\n"; print REP "Percent_CrossHyb=$prcnt_crosshyb\n"; print REP "Percent_Good=$prcnt_good\n"; print REP "Saturation_Cutoff=$saturation_cutoff\n"; print REP "Partial_Saturation_Cutoff=$partial_saturation_cutoff\n"; print REP "Noise_Cutoff=$noise_cutoff\n"; print REP "Max_PM=$max_cell\n"; print REP "Min_PM=$min_cell\n"; close(REP); } sub mean_median { my $arrayref = shift; my @array = sort {$a <=> $b} @$arrayref; if (@array % 2) { return $array[@array/2]; } else { return ($array[@array/2-1] + $array[@array/2]) / 2; } } sub calc_percent { my($value,$total) = @_; $value /= $total; return $value * 100.0; } sub recalc_noise { my($x,$s) = @_; use Math::Libm 'erf'; $a = exp( -$x*$x / (2.0*$s*$s) ); $b = ( 1.0 + erf( $x / ( $s * $sq2 ) ) ) / 2; $c = $x * $b + ( $s / sqrt(2.0 * $pi) ) * $a; return $c; } sub fit_gaussian { my($file,$bckgrd_0,$sigma_0,$N,$sig_fac) = @_; my($ave,$upper_MM,$bckgrd_1,$sigma_1,$N_1); $dat_file = $file . "_noise.dat"; open(DAT, ">${dat_file}") || die "Can't write $dat_file.\n"; $upper_MM = $bckgrd_0 + $sig_fac * $sigma_0; foreach $ave (keys %{$hist}) { if ($ave <= $upper_MM) { print DAT "$ave\t$hist->{$ave}\n"; } } close(DAT); $tmp_in = $file . ".gauss.in"; $tmp_out = $file . ".gauss.out"; $ps_plot = $file . ".gauss.ps"; open(TMP, ">/tmp/${tmp_in}") || die "can't write ${tmp_in}\n"; print TMP "x <- matrix(scan(\"$dat_file\"), ncol = 2, byrow = T)\n"; print TMP "fn <- function(p) sum((x[,2] - (2*p[3]/(sqrt(2*pi)*p[2]))*exp(-(x[,1]-p[1])^2/(2*p[2]^2)))^2)\n"; print TMP "out <- nlm(fn, p = c($bckgrd_0, $sigma_0, $N))\n"; print TMP "p <- out\$estimate\n"; print TMP "write.table(p, file = \"/tmp/$tmp_out\", quote=F, row.names=F, col.names=F)\n"; print TMP "plot(x[,1],x[,2])\n"; print TMP "xfit <- seq($MM_sorted[0], $upper_MM, .1)\n"; print TMP "yfit <- (2*p[3]/(sqrt(2*pi)*p[2]))*exp(-(xfit- p[1])^2/(2*p[2]^2))\n"; print TMP "lines(spline(xfit, yfit))\n"; print TMP "q()\n"; print TMP "n\n"; close(TMP); system("R --vsize=20M --no-save --slave < /tmp/$tmp_in\n"); system("mv Rplots.ps $ps_plot\n"); system("rm /tmp/$tmp_in\n"); open(R_OUT, "/tmp/$tmp_out") || die "can't read ${tmp_out}\n"; $_ = ; chomp; $bckgrd_1 = $_; $_ = ; chomp; $sigma_1 = $_; $_ = ; chomp; $N_1 = $_; close(R_OUT); system("rm /tmp/$tmp_out\n"); return($bckgrd_1,$sigma_1); } sub calc_max_hist { my($count_hist,$max,$bckgrd); $max = 0; $count_hist = 0; foreach $ave (keys %{$hist}) { $count_hist += $hist->{$ave}; if ($hist->{$ave} > $max && $ave < 10000) { $max = $hist->{$ave}; $bckgrd = $ave; } } print STDERR "count hist: $count_hist\n"; return $bckgrd; } sub calc_hist { my($arrayref) = shift; my($MM,$count,$ave); @MM_sorted = sort { $a <=> $b } @$arrayref; $last_MM = $#MM_sorted; $tot_MM = $last_MM + 1; $min_MM = $MM_sorted[0]; $max_MM = $MM_sorted[$last_MM]; print STDERR "total MM = $tot_MM\n"; $num_bins = int(($max_MM - $min_MM)/$resolution)+1; $first_MM = 0; for ($j=0; $j<=$num_bins; $j++) { $min = $min_MM + $j * $resolution; $max = $min + $resolution; $ave = ($min + $max)/2.0; $in_bin = 0; for ($i=$first_MM; $i<=$last_MM; $i++) { if (($MM_sorted[$i] >= $min) && ($MM_sorted[$i] < $max) ) { $in_bin++; } else { $first_MM = $i; last; } } $hist->{$ave} = $in_bin; } } sub read_raw { my($file) = $_[0]; my($sat,$probe_id); @cv_array = (); open(FILE, $file) || die "cannot open $file\n"; $_ = ; chomp; @input = split(/\t/, $_); $min_cell = $input[1]; $max_cell = $input[2]; $_ = ; chomp; @input = split(/\t/, $_); $num_sd_zero = $input[1]; if (($num_sd_zero >= $min_sd_zero) || ($max_cell > $saturation_default)) { $sat = $max_cell; } else { $sat = $saturation_default; } while () { chomp; @input = split(/\t/, $_); $x = $input[2]; $y = $input[3]; $probe_id = $input[0]; if ($input[1] =~ /PM/) { $PM = $input[4]; $PM_x = $x; $PM_y = $y; $SD = $input[5]; $PM_id = $probe_id; push(@cv_array,$SD/$PM); } else { $MM_x = $x; $MM_y = $y; $MM_id = $probe_id; if (($MM_x == $PM_x) && ($MM_y = $PM_y + 1) && ($PM_id eq $MM_id)) { $MM = $input[4]; $DIFF = $PM - $MM; if (abs($DIFF) < $epsilon) { push(@MM_noise,$MM); } } else { print STDERR "Warning: $file not properly sorted!\n"; print STDERR "Aborting execution.\n"; die; } } } close(FILE); return $sat; } sub calc_left_sigma { my($bckgrd) = $_[0]; my($MM,$var,$count); $var = 0; $count = 0; foreach $MM (@MM_sorted) { if ($MM < $bckgrd) { $count++; $var += ($MM - $bckgrd) ** 2; } } $var /= ($count - 1); return (sqrt($var),$count); }