#!/usr/bin/perl -w

#
# cel2ratios, written by Felix Naef (2001-2002)
#
# Takes two processed cell files (.PCEL files) and computes ratios 
plus other stuff
#

use strict;
use Getopt::Long;

my($cell_file1, $cell_file2, $pval, $noisecut, $adjust, $quantile, 
$max_cells, $outfile, $noout, $sgf, $mmfl);

#defaults
$pval=0; $adjust=0; $noout=0; $sgf=0; $mmfl=0, $max_cells=24, 
$quantile=0; $noisecut=3;

&GetOptions("f1=s"    => \$cell_file1,
            "f2=s"    => \$cell_file2,
	    "o=s"     => \$outfile,
	    "noout"   => \$noout,
	    "p"       => \$pval,
	    "adj=i"   => \$adjust,
	    "sgf"   => \$sgf,
	    "max=i"   => \$max_cells,
	    "nc=f"    => \$noisecut,
	    "qu=f"    => \$quantile,
	    "mm=i"    => \$mmfl);

if ( (not defined $outfile) || substr($outfile, -4) ne ".RAT") {die 
"Output file must have .RAT extension\n

Usage: cel2ratios OPTIONS

where OPTIONS are:
                  -f1     first  .PCEL file
		  -f2     second .PCEL file
		  -o      output .RAT file
		  -noout  do not output ratio file, only output the 
.REP file
                  -p      compute Wilcoxon p-values for ratios
		  -adj    adjust (normalization), 1 for global, 2 for 
local (robust but slow cf. \$Nloess)
		  -sgf    computes the log-ratio divided by the local 
variance (robust but slow \$Nloess)
		  -mm     default is PM only; 1 for PM-MM; 2 for MM 
only; 3 for all PM,MMs  
		  -max    default=24, maximum number of probepairs 
per probeset (some probesets have more, i.e. in HG_U95)
		  -nc     default is 3, noise cutoff, probes below 
nc*sigma_eff are discarded
                  -qu     default is .5, quantile to use in LTS 
regression
\n";}

# Parameters
  
my $log2=log(2.0);

my $min_adjust=50;
my $fc_adjust=1.2;
my $tol=0.00001;	#convergence can be slow
my $max_iter=5000;

my $span = .75;		# span used in loess function
my $Nloess = 4000;	# number of randomly chosen points in loess 
[0: all]
my $loess_it = 4;       # number of iterations in loess

$max_cells *=2 if $mmfl==3;

# Open various files

$outfile =~ s/.RAT/_PM.RAT/ if not $mmfl;
$outfile =~ s/.RAT/_SUB.RAT/ if $mmfl==1;
$outfile =~ s/.RAT/_MM.RAT/ if $mmfl==2;
$outfile =~ s/.RAT/_APP.RAT/ if $mmfl==3;

my $outrep = $outfile; $outrep =~ s/RAT/REP/;
my $addrep = $outfile; $addrep =~ s/RAT/AREP/;

open(REP, ">$outrep") || die "Can't open $outrep\n";		
	#report file
open(OUT, ">$outfile") || die "Can't open $outfile\n" unless $noout;
	#output file

#write params to .REP file

print REP "PM only\n" if not $mmfl;
print REP "MM only\n" if $mmfl==2;
print REP "PM-MM \n" if $mmfl==1;
print REP "PM,MM \n" if $mmfl==3;

print REP "File1: $cell_file1\n";
print REP "File2: $cell_file2\n";
print REP "Maximum number of probepairs: $max_cells\n";
print REP "P-values: $pval\n";
print REP "Adjust: $adjust\n";
print REP "No output: $noout\n";
print REP "Local STD: $sgf\n";
print REP "Output file: $outfile\n";
print REP "Noise cutoff: $noisecut\n";
print REP "LTS Quantile: $quantile\n";
print REP "Loess span: $span\n";
print REP "NLoess: $Nloess\n";

my @files=($cell_file1, $cell_file2); my $nfiles=2;
my ($file, $pm, $flag, $n, $dummy, $nf, $mm, $pmmm);
my (@label, @pm, @mm, @pmmm, @flag, @data_cell);
my %data;

my ($sig_file, $bkg, $sig, $sig1, $sat, $sig_eff, $int, $ints);
my %sat;

$nf=0;
foreach $file (@files) {
  open(F1, $file) || die "Can't open $file\n";
  
  $sig_file = $file; $sig_file =~ s/.PCEL/.REP/;
  ($bkg, $sig, $sat)=&read_sigma_sat($sig_file);
  printf REP "$file: Bare sigma: %.3f\n", $sig;
  $sig1=$sig if $file eq $cell_file1;
  $sig_eff = $sig;
  $sig_eff = sqrt(2.0)*$sig if ($mmfl==1);
  
  $sat -= $bkg * 1.05;	#this is arbitrary for now
  
  while () {
    chomp;
    @label=split(/\t/);
    $pm=; chomp $pm;
    @pm=split(/\t/, $pm);
    $mm=; chomp $mm;
    @mm=split(/\t/, $mm);
    $pmmm=; chomp $pmmm;
    @pmmm=split(/\t/, $pmmm);
    $flag=;
    
    unless ( $mmfl==3 ) {
      unless ( $label[1] > $max_cells ) { 
	for ($n=0; $n < $label[1]; $n++) {

          my $nsat=0;
          if($mmfl==0) {
	    my $m;
	    for ($m=0; $m < $label[1]; $m++) {
	      $nsat++ if $pm[$m] > $sat;
	    }
	  }
	  
          if($mmfl==0) {
	    $sat{$label[0]} = "PM" if $nsat <= $label[1]/2;
	    $sat{$label[0]} = "MM" if $nsat > $label[1]/2;
	  }	    

          ($int, $ints) = ($pm[$n], $pm[$n]) if (not $mmfl) && $nsat 
<= $label[1]/2;
	  ($int, $ints) = ($mm[$n], $mm[$n]) if (not $mmfl) && $nsat 
>  $label[1]/2;
	  ($int, $ints) = ($pm[$n]-$mm[$n], $pm[$n]) if $mmfl == 1;
	  ($int, $ints) = ($mm[$n], $mm[$n]) if $mmfl == 2;
          	  
	  $flag = flag($ints, $sig_eff, $sat);
	  
	  @data_cell=($int, $flag);
	  $data{$label[0]} -> [$n] -> [$nf] = [@data_cell];
	}
      }
    }
    else {
      unless ( $label[1] > $max_cells/2 ) {
        for ($n=0; $n < $label[1]; $n++) {

          ($int, $ints) = ($pm[$n], $pm[$n]);
	  $flag = flag($ints, $sig_eff, $sat);
          @data_cell=($ints, $flag);
	  $data{$label[0]} -> [2*$n] -> [$nf] = [@data_cell];
	  
	  ($int, $ints) = ($mm[$n], $mm[$n]);
	  $flag = flag($ints, $sig_eff, $sat);
          @data_cell=($int, $flag);
	  $data{$label[0]} -> [2*$n+1] -> [$nf] = [@data_cell];
	}
      }
    }
  }   
  close(F1);
  $nf++;
}

# Compute ratios, run through all affy_labels

my $tmpf1 = $outfile . ".tmp";
my $tmpf2 = $outfile . ".tmpl";
my $outfile_Rout = $outfile . ".Rout";

my %above;
count_noise();
write_cells();
call_R_regression();
my %output;
read_R_regression();

unlink $tmpf1, $tmpf2, $outfile_Rout;
  
#adjust
my @all_ratios;
my ($label, $ratio);

if ($adjust) {
  foreach $label (keys %output) {
    $ratio = $output{$label} -> [2]; 
    unless ($ratio eq "NA") {
      push @all_ratios, $ratio;
    }
  }
}

# Adjust scaling by centering the ratio histogram and optionnally 
loess

my ($log_adj, $del, $n_iter, $scale_global);
my (%scale_local, %sgf);
  
if ($adjust) {
  ($log_adj, $del, $n_iter)=adjust(@all_ratios) if $cell_file1 ne 
$cell_file2;
  ($log_adj, $del, $n_iter)=(0,0,0) if $cell_file1 eq $cell_file2;
  printf REP "\nAdjusted scale (log2 base): %.3f\n", $log_adj;
  print REP "Number of iterations: $n_iter\n";
  printf REP "Delta: %.6e\n",$del;
  $scale_global=$log_adj;  
  printf REP "\nScale factor for $cell_file1: %.3f\n", 2**(-
$log_adj);  
  printf REP "\nScaled sigma 1: %.3f\n", $sig1*2**(-$log_adj);
  &scale_global();
}

my ($R_local_reg_dat, $R_local_reg, $R_out_local_reg);
if ($adjust == 2) {

  unless($cell_file1 eq $cell_file2) {
    $R_local_reg_dat = $outfile; $R_local_reg_dat =~ 
s/RAT/local.dat/;
    $R_local_reg = $outfile; $R_local_reg =~ s/RAT/local.R/;
    $R_out_local_reg = $outfile; $R_out_local_reg =~ 
s/RAT/local.R_out/;
    &local_scaling_R;
    &read_local_scaling_R;
    print REP "\nComputed local regression scaling.";  
    &scale_local;  
    unlink ($R_local_reg_dat, $R_local_reg, $R_out_local_reg);
  }
}

my (%pvals, %reg);
my ($R_script_pval, $R_script_out_pval);
my ($R_script_sgf, $R_script_sgf_dat, $R_script_out_sgf);

if ($pval) {

  $R_script_pval = $outfile; $R_script_pval =~ s/RAT/pval.R/;
  $R_script_out_pval = $outfile; $R_script_out_pval =~ 
s/RAT/pval.R_out/;
  &write_cells();
  &call_R_pvalues;
  &read_R_pvalues;
  print REP "\nComputed the paired Wilcoxin test p-values.\n";
  unlink $tmpf1, $tmpf2, $R_script_pval, $R_script_out_pval;
}

if ($sgf) {
  $R_script_sgf_dat = $outfile; $R_script_sgf_dat =~ s/RAT/sgf.dat/;
  $R_script_sgf = $outfile; $R_script_sgf =~ s/RAT/sgf.R/;
  $R_script_out_sgf = $outfile; $R_script_out_sgf =~ 
s/RAT/sgf.R_out/;
  &call_R_sgf;
  &read_R_sgf;
  print REP "Computed the local STD.\n";
  unlink $R_script_sgf_dat, $R_script_sgf, $R_script_out_sgf;
}

# Write output

my (@result1, @result);

my $line0 = "Label\tI1\tI2\tRatio\tLog2 
Ratio\tRes\tRes2\tNcells\tNabove";
$line0 .= "\tp-value" if $pval;
$line0 .= "\tLog-Ratio/Std" if $sgf;
$line0 .= "\tlocal scale" if $adjust==2;
$line0 .= "\tlocal sigma" if $sgf;
$line0 .= "\tPM|MM" if $mmfl==0;
$line0 .= "\n";


unless ($noout) {

  print OUT $line0;
  foreach $label (keys %output) {

    @result=@{$output{$label}};
  
    my $format1="%s\t";
    
    if ($result[0] ne "NA") {
      @result1=( $result[0], $result[1], 2**$result[2], 
@result[2..5]);
      $format1 .= ("%.5f\t" x 6) . "%d\t";
    }
    else {
      @result1 = qw (NA NA NA NA NA NA NA);
      $format1 .= ("%s\t" x 7);
    }
    
    push @result1, $above{$label};
    $format1 .= "%s\t";
    
    if ($pval) {              
      if ($pvals{$label} =~ /NA/) {$format1 .= "%s\t";}
      else                      {$format1 .= "%.3e\t";}
      push @result1, $pvals{$label};
    }
      
    if ($sgf) {              
      if (exists $sgf{$label}) {
        if ($sgf{$label} ne "NA" && $sgf{$label} > 0 ) {
          $format1 .= "%.3e\t";
	  push @result1, $result[2] / $sgf{$label} ;
	}
	else {
	  $format1 .= "%s\t";
	  push @result1, "NA";
	}	  
      }
      else {
        $format1 .= "%s\t";
	push @result1, "NA";
      }
    }
    
    if ($adjust==2) {
      if (exists $scale_local{$label}) {
        if ($scale_local{$label} ne "NA" ) {
          $format1 .= "%.3e\t";
	  push @result1, $scale_local{$label} ;
	}
	else {
	  $format1 .= "%s\t";
	  push @result1, "NA";
	}	  
      }
      else {
        $format1 .= "%s\t";
	push @result1, "NA";
      }
    }
    
    if ($sgf) {              
      if (exists $sgf{$label}) {
        if ($sgf{$label} ne "NA" && $sgf{$label} > 0 ) {
          $format1 .= "%.3e\t";
	  push @result1, $sgf{$label} ;
	}
	else {
	  $format1 .= "%s\t";
	  push @result1, "NA";
	}	  
      }
      else {
        $format1 .= "%s\t";
	push @result1, "NA";
      }
    }
  
    if ($mmfl == 0) {
      $format1 .= "%s\t";
      push @result1, $sat{$label};
    }
  
    chop $format1; $format1 .= "\n";      
    printf OUT $format1, $label, @result1;
  }
} 

sub mean {

  use strict;
    
  if (scalar @_ == 0) {return "NA"};
  my $mean=0;
  my $n=0;
  my $el;  
  foreach $el (@_) {
    $n++;
    $mean += $el;
  }  
  $mean /= $n;
  return $mean;
}

sub by_number {
  $a <=> $b;
}

sub by_ratio {

  use strict;
  
  my @c=split(":", $a);
  my @d=split(":", $b);
  
  $c[0] <=> $d[0];
}

sub adjust {

use strict;

if (scalar @_ < $min_adjust) {return (0, 0, 0);}

my @l_ratios=sort by_number @_; my @histo=();
my $len = scalar @l_ratios;
my $win = int(sqrt($len)/2);

my $dy = 2*$win;
my $sl_max = -1; my $i_max;
my ($i, $diff, $slope);

#open(HIS, ">hfc");

for ($i=$win; $i < $len - $win; $i+=$win) {

  $diff = $l_ratios[$i+$win] - $l_ratios[$i-$win];

  if ($diff > 0) {
    $slope = $dy / $diff; $histo[$i]=$slope;
    
    if ($slope > $sl_max) { $sl_max=$slope; $i_max=$i;}      
#    print HIS $l_ratios[$i], "\t", $slope, "\n"; 
  }
}
#close(HIS);

#center the histogram recurively

my $fc_cutoff=log($fc_adjust)/$log2;
my $delta=1; my $center_new; my $nc;
my $center=$l_ratios[$i_max];

my $n_iter=0;

while ($delta > $tol && $n_iter < $max_iter) {

  $n_iter++;
  
  $center_new=0; $nc=0;
  for ($i=0; $i < $len; $i++) {
    if ( abs($l_ratios[$i]-$center) < $fc_cutoff) {
      $center_new += $l_ratios[$i];
      $nc++;
    }
  }
  $center_new /= $nc;
  $delta = abs($center-$center_new); 
  $center = $center_new;
}

return ($center, $delta, $n_iter) ;

}

sub write_cells {

use strict;

open(TMP, ">$tmpf1") || die "Can't open $tmpf1\n";
open(TMPL, ">$tmpf2") || die "Can't open $tmpf2\n";

my $ncells;

foreach $label (keys %data) {

  print TMPL $label, "\n";
  
  my ($n, $i1, $i2, $f1, $f2);
  my @int1=(); my @int2=();
  
  $ncells = scalar @{$data{$label}};
  
  my $ngood=0;
  for ($n=0; $n < $ncells; $n++) {
  
    $i1  = $data{$label} -> [$n] -> [0] -> [0];
    $i2  = $data{$label} -> [$n] -> [1] -> [0];
    $f1 = $data{$label} -> [$n] -> [0] -> [1];
    $f2 = $data{$label} -> [$n] -> [1] -> [1];
    
    if ( ($f1 =~ /^S/ && $f2 =~ /^S/) || ($f1 =~ /^N/ && $f2 =~ /^N/) 
||
         ($i1 <= 0 || $i2 <= 0) || ($f1 =~ /NA/ || $f2 =~ /NA/) ) {

#      push @int1, "NA";
#      push @int2, "NA";
      
    }  
    else {
      $ngood++;
      push @int1, $i1;
      push @int2, $i2;
    }   	
  }
  
       
# Fill in the blanks
    
  for ($n = $ngood; $n < $max_cells; $n++) {
    push @int1, "NA";
    push @int2, "NA";
  }
  
  print TMP join("\t", @int1), "\n";
  print TMP join("\t", @int2), "\n";
}
close(TMP);
close(TMPL)
}

sub count_noise {

use strict;

my $ncells;

foreach $label (keys %data) {

  my ($f1, $f2);
  my $n1=0;
  my $n2=0;
  
  $ncells = scalar @{$data{$label}};
  
  for ($n=0; $n < $ncells; $n++) {
  
    $f1 = $data{$label} -> [$n] -> [0] -> [1];
    $f2 = $data{$label} -> [$n] -> [1] -> [1];
    
    $n1++ if $f1 !~ /^N/;
    $n2++ if $f2 !~ /^N/;      
  }
  
  $above{$label} = "$n1:$n2";  

}
}

sub call_R_pvalues {
 
use strict;

open(TMP, ">$R_script_pval") || die "Can't open \n";

print TMP "x <- matrix(scan(\"$tmpf1\"), ncol = $max_cells, byrow = 
T)\n";
print TMP "ncells<-dim(x)[2]\n";
print TMP "nprobes<-dim(x)[1] / 2\n";
print TMP "p <- 1:nprobes\n";
print TMP "for (i in 1:nprobes) { \n";
print TMP "e1<-x[2*i-1,]\n";
print TMP "e2<-x[2*i,]\n";
print TMP "if (length(e1[e1==\"NA\"]) == ncells) {\n";
print TMP "p[i] <- NA}\n";
print TMP "else {\n";
print TMP "p[i] <- wilcox.test(e1, e2, paired = T)\$p.value\n";
print TMP "}\n";
print TMP "}\n";
print TMP "write.table(p, file = \"$R_script_out_pval\", quote=F, 
row.names=F, col.names=F)\n";
close(TMP);
system("R --vsize=40M --no-save --slave --quiet < $R_script_pval\n");
}

sub call_R_regression {
 
use strict;

open(TMP, ">$outfile.Reg") || die "Can't open $outfile.R\n";

print TMP "library(lqs)\n";
print TMP "l2 <- log(2.0)\n";

print TMP "x <- matrix(scan(\"$tmpf1\"), ncol = $max_cells, byrow = 
T)\n";
print TMP "ncells<-dim(x)[2]\n";
print TMP "nprobes<-dim(x)[1] / 2\n";
print TMP "i1 <- 1:nprobes\n";
print TMP "i2 <- 1:nprobes\n";
print TMP "sl <- 1:nprobes\n";
print TMP "r2 <- 1:nprobes\n";
print TMP "r2t <- 1:nprobes\n";
print TMP "nt <- 1:nprobes\n";

print TMP "for (n in 1:nprobes) { \n";
print TMP "e1<-x[2*n-1,]\n";
print TMP "e2<-x[2*n,]\n";

print TMP "e1 <- e1[e1!=\"NA\"]\n";
print TMP "e2 <- e2[e2!=\"NA\"]\n";
print TMP "l1 <- length(e1)\n";
print TMP "if (l1 == 0) {\n";
print TMP "i1[n] <- NA\n";
print TMP "i2[n] <- NA\n";
print TMP "sl[n] <- NA\n";
print TMP "r2[n] <- NA\n";
print TMP "r2t[n] <- NA\n";
print TMP "nt[n] <- NA\n";
print TMP "}\n";
print TMP "else {\n";
print TMP "e1 <- log(e1)\n";
print TMP "e2 <- log(e2)\n";
print TMP "ro <- ltsreg(e1-e2 ~ +1, 
quantile=ceiling(length(e1)*$quantile))\n" if $quantile > 0;
print TMP "ro <- ltsreg(e1-e2 ~ +1)\n" if $quantile == 0;
print TMP "sl[n] <- ro\$coefficients[[1]] / l2\n";
#
#print TMP "if (sl[n] < 0) sl[n] <- min(e1-e2) / l2\n";
#print TMP "if (sl[n] > 0) sl[n] <- max(e1-e2) / l2\n";
#
print TMP "res <- ro\$residuals\n";
print TMP "ord <- order(res^2)\n";
print TMP "nn <- floor(l1/2)+1\n";
print TMP "nt[n] <- l1\n";
print TMP "i1[n] <- exp(mean(e1[ord[1:nn]]))\n";
print TMP "i2[n] <- exp(mean(e2[ord[1:nn]]))\n";
print TMP "r2[n] <- sqrt(mean(res[ord[1:nn]]^2)/l2)\n";
print TMP "r2t[n] <- sqrt(mean(res^2)/l2)\n";

print TMP "}\n";
print TMP "}\n";

print TMP "reg <- data.frame(i1,i2,sl,r2,r2t,nt)\n";
print TMP "write.table(reg, file = \"$outfile_Rout\", sep=\"\\t\", 
quote=F, row.names=F, col.names=F)\n";

close(TMP);


system("R --vsize=80M --no-save --slave --quiet < $outfile.Reg\n");
unlink("$outfile.Reg");
}

sub call_R_sgf {

  use strict;
  
  open(RDAT, ">$R_script_sgf_dat");
  print RDAT "Label\tI1\tI2\tNgood\n";
  foreach $label (keys %output) {
    unless (@{$output{$label}}[2] eq "NA") {
      print RDAT $label, "\t", @{$output{$label}}[0], "\t", 
@{$output{$label}}[1], "\t", @{$output{$label}}[5], "\n";
    }
  }
  close(RDAT);
  
  my $ng_co = 3;
  
  open(RS, ">$R_script_sgf");
  print RS "l2 <- log(2.0)\n";
  print RS "dat <- read.table(\"$R_script_sgf_dat\", sep=\"\\t\", 
header=T, as.is=c(1))\n";
  print RS "lgm0 <- ( log(dat\$I1) + log(dat\$I2) )/2\n";
  print RS "lab <- dat\$Label\n";
  print RS "dat <- subset(dat, Ngood != \"NA\" & Ngood > $ng_co)\n";
  print RS "lgm <- ( log(dat\$I1) + log(dat\$I2) )/2\n";
  print RS "lrat <- log(dat\$I1) - log(dat\$I2)\n";
  print RS "lrat <- lrat^2\n";
  print RS "library(modreg)\n";
  print RS "if ($Nloess > 0) {\n";
  print RS "  ri <- floor(runif($Nloess, 2, length(lrat)))\n";
  print RS "  ri <- c(ri, 1, length(lrat))\n";
  print RS "  lgm <- lgm[ri]\n";
  print RS "  lrat <- lrat[ri]\n";  
  print RS "  w<-rep(1, length(lrat))\n";
  print RS "}\n";
  print RS "w[1]<-0\n";
  print RS "w[length(lrat)]<-0\n";
  print RS "w<-rep(1, length(lrat))\n";
  print RS "w[0:10]<-0\n";
  print RS "w[(length(lrat)-10):length(lrat)]<-0\n";
  print RS "l <- loess(lrat ~ lgm, weights=w, span=$span, 
family=\"symmetric\", iterations=$loess_it, 
trace.hat=\"approximate\")\n";
  print RS "min <- min(lgm)\n";
  print RS "max <- max(lgm)\n";
  print RS "lgm0[lgm0 > max] <- max-0.001\n";
  print RS "lgm0[lgm0 < min] <- min+0.001\n";
  print RS "locsig <- predict(l, newdata=lgm0)\n";
  print RS "minp <- min(locsig[locsig>0])\n";
  print RS "locsig[locsig<=0] <- minp\n";
  print RS "out <- data.frame(lab, sqrt(locsig)/l2)\n";
  print RS "write.table(out, file=\"$R_script_out_sgf\", row.names=F, 
quote=F, sep=\"\\t\")\n";
  close(RS);
  system("R --vsize=128M --no-save --slave --quiet < 
$R_script_sgf\n");
}  

sub read_R_sgf{

  open (ROUT, $R_script_out_sgf);
  my @local=;
  close(ROUT);
  chomp @local;
  
  my @sgf;

  foreach (@local) {
    @sgf = split(/\t/);
    $sgf{$sgf[0]} = $sgf[1];
  }
}

sub local_scaling_R{

  use strict;
  
  open(RDAT, ">$R_local_reg_dat");
  print RDAT "Label\tI1\tI2\tl2ratio\tNgood\n";
  foreach $label (keys %output) {
    unless (@{$output{$label}}[2] eq "NA") {
      print RDAT $label, "\t", @{$output{$label}}[0], "\t", 
@{$output{$label}}[1], "\t", @{$output{$label}}[2], "\t", 
@{$output{$label}}[5],"\n";
    }
  }
  close(RDAT);
  
  my $ng_co = 3;
  my $l2r_co = 1;
  
  open(RS, ">$R_local_reg");
  print RS "dat <- read.table(\"$R_local_reg_dat\", sep=\"\\t\", 
header=T, as.is=c(1))\n";
  print RS "lgm0 <- ( log(dat\$I1) + log(dat\$I2) )/2\n";
  print RS "lab <- dat\$Label\n";
  print RS "dat <- subset(dat, l2ratio != \"NA\" & abs(l2ratio) < 
$l2r_co & Ngood > $ng_co)\n";
  print RS "lgm <- ( log(dat\$I1) + log(dat\$I2) )/2\n";
  print RS "lrat <- log(dat\$I1) - log(dat\$I2)\n";
  print RS "library(modreg)\n";
  print RS "if ($Nloess > 0) {\n";
  print RS "  ri <- floor(runif($Nloess, 2, length(lrat)))\n";
  print RS "  ri <- c(ri, 1, length(lrat))\n";
  print RS "  lgm <- lgm[ri]\n";
  print RS "  lrat <- lrat[ri]\n";  
  print RS "  w<-rep(1, length(lrat))\n";
  print RS "}\n";  
  print RS "w<-rep(1, length(lrat))\n";
  print RS "w[1]<-0\n";
  print RS "w[length(lrat)]<-0\n";
  print RS "l <- loess(lrat ~ lgm, weights=w, family=\"symmetric\", 
trace.hat=\"approximate\")\n";
  print RS "min <- min(lgm)\n";
  print RS "max <- max(lgm)\n";
  print RS "lgm0[lgm0 > max] <- max-0.001\n";
  print RS "lgm0[lgm0 < min] <- min+0.001\n";
  print RS "locscale <- predict(l, newdata=lgm0)\n";
  print RS "out <- data.frame(lab, locscale)\n";
  print RS "write.table(out, file=\"$R_out_local_reg\", row.names=F, 
quote=F, sep=\"\\t\")\n";
  close(RS);
  system("R --vsize=128M --no-save --slave --quiet < 
$R_local_reg\n");
}

sub read_local_scaling_R{

  open (ROUT, $R_out_local_reg);
  my @local=;
  close(ROUT);
  chomp @local;
  
  my @local_reg;

  foreach (@local) {
    @local_reg = split(/\t/);
    $scale_local{$local_reg[0]} = $local_reg[1];
  }
}
sub read_R_pvalues {

open (ROUT, $R_script_out_pval);
open (LABELS, $tmpf2);

my @pvalues=;
chomp @pvalues;
my @labels=;
chomp @labels;

if (scalar @pvalues != scalar @labels) {print REP "Error in the p-
values!\n"}

my $n;
for($n=0; $n;
  chomp @reg;
  my @labels=;
  chomp @labels;
  my @line;

  if (scalar @reg != scalar @labels) {print REP "Error in the 
regression!\n"}

  my $n;
  for($n=0; $n){
    if (/^Background=/) {
      $bkg=(split(/=/))[1];
    }
    if (/^Std_Dev_Background=/) {
      $sigma=(split(/=/))[1];
    }
    if (/^Saturation_Cutoff=/) {
      $sat=(split(/=/))[1];
      last;
    }
  }
  close(TMP);
  die "Could not read the background!" if not defined $bkg;
  die "Could not read sigma!"          if not defined $sigma;
  die "Could not read saturation!"     if not defined $sat;
  return ($bkg, $sigma, $sat);
}

sub scale_global () {

  use strict;
  
  my $scale = 2**$scale_global;
  
  foreach $label (keys %data) {
  
    my $n;  
    my $ncells = scalar @{$data{$label}};

    for ($n=0; $n < $ncells; $n++) {  
      $data{$label} -> [$n] -> [0] -> [0]  /= $scale;
    }
    unless ( @{$output{$label}}[2] eq "NA") {
      @{$output{$label}}[0] /= $scale;
      @{$output{$label}}[2] -= $scale_global;
    }
  }
}

sub scale_local () {

  use strict;  
  foreach $label (keys %data) {
  
    my ($n, $scale);
    
    if (exists $scale_local{$label}) {
      $scale = 2**$scale_local{$label}; 
      my $ncells = scalar @{$data{$label}};

      for ($n=0; $n < $ncells; $n++) {  
	$data{$label} -> [$n] -> [0] -> [0]  /= $scale;
      }

      unless ( @{$output{$label}}[2] eq "NA") {
	@{$output{$label}}[0] /= $scale;
	@{$output{$label}}[2] -= $scale_local{$label};
      }
    }
  }
}

sub flag {
  return "N" if $_[0] eq "NA";
  return "N" if $_[0] < $noisecut*$_[1];
  return "G" if $_[0] > $noisecut*$_[1] && $_[0] <= int($_[2]);
  return "S" if $_[0] >= int($_[2]);
}

sub perc {
  
  my @good=(); my $length=0; my $el;
  
  foreach $el (@_) {
    if ($el ne "NA"){ push @good, $el; $length++;}
  }
  if ($length==0) {return qw (NA NA NA NA NA);}
  
  
  my @b = sort {$a <=> $b} @good;
  my ($med, $first_q, $third_q);

  if ($length % 2 == 1) {
    my $mid=int($length/2);
    $med = $b[$mid] ;
  }
  else {
    my $mid=$length/2;
    $med = ($b[$mid-1] + $b[$mid]) / 2;
  }
  if ($length 0 ;
  return "NA";
}