#!/usr/local/bin/perl # repair plain corrupt # Takes two files of the same size. Learns a bigram model of the # plain file, and a substitution model that converts the plain file # into the corrupt file (line by line). # Then, attempts to "repair" corrupt line by line. # Outputs success stats to stdout. # MAIN: get the file names, collect stats, repair { $plain = $ARGV[0]; $corrupt = $ARGV[1]; makemodels($plain, $corrupt); repair($plain, $corrupt); } # Collects data, line by line. sub makemodels { local($plain,$corrupt) = @_; open(PL, "<$plain"); open(CR, "<$corrupt"); while () { $plln = $_; $crln = ; # print "PL: $plln\n"; # print "CR: $crln\n"; modelline($plln, $crln); } close(PL); close(CR); } # Collect counts for the give line # $unitotal: total letters in the file # $unigram{$plainchar}: number of times plainchar appears in plain. # $obs{$plainchar,$corruptchar}: number of times plainchar is # corrupted into corruptchar # $bigram{$plainchar1,$plainchar2}: number of times the two # plainchars appear together in the given order. # \n in the beginning of line marker, \r is the end of line marker sub modelline { local($plln,$crln) = @_; # characters from the lines local($i, $len); local($plch, $crch); # previous character local($prch) = "\n"; $unigram{$prch}++; $unitotal++; $len = length($plln); if (length($crln) != $len) { print STDERR "PLAIN: `$plln' does not match CORRUPT: `$crln'\n"; exit; } for ($i = 0; $i < $len; $i++) { $plch = substr($plln,$i,1); $crch = substr($crln,$i,1); $bigram{$prch, $plch}++; $obs{$plch, $crch}++; $unigram{$plch}++; $unitotal++; $prch = $plch; } # end marker $plch = "\r"; $bigram{$prch, $plch}++; } # Try to fix the given corrupted file. Uses the plain file to see how # well it did. sub repair { local($plain,$corrupt) = @_; local($rpln); local($right1, $wrong1, $r1, $w1, $right2, $wrong2, $r2, $w2); $right1 = 0; $wrong1 = 0; $right2 = 0; $wrong2 = 0; open(PL, "<$plain"); open(CR, "<$corrupt"); while () { chop; $plln = $_; $crln = ; chop $crln; # print "PL: $plln\n"; # print "CR: $crln\n"; $rpln = repairline($crln); print "$rpln\n"; ($r1, $w1, $r2, $w2) = compareline($plln, $crln, $rpln); $right1 += $r1; $wrong1 += $w1; $right2 += $r2; $wrong2 += $w2; } print STDERR "RIGHT1: $right1, WRONG1: $wrong1\n"; print STDERR "RIGHT2: $right2, WRONG2: $wrong2\n"; printf (STDERR "%f%% correct.\n", ($right1+$right2)/($right1+$right2+$wrong1+$wrong2)); close(PL); close(CR); } # Given a corrupted line, returns a repaired line. # (This is the one you need to change.) sub repairline { local($crln) = @_; local($i, $len, $j, $prevlist, $curlist); local($plch, $crch, $prch); # previous character local($rpln); local($best, $bestscore, $score, %prob, %back); $prch = "\n"; $rpln = ""; $len = length($crln); # print "$len #\n"; # Possible symbols on previous step $prevlist = "\n"; $prob{"\n", -1} = 1.0; # print "{$crln}\n"; # try all positions for ($i = 0; $i < $len; $i++) { $crch = substr($crln,$i,1); # print "($crch), $prevlist\n"; # try all current characters $curlist = ""; foreach $plch (keys %unigram) { $best = "DUMMY"; $bestscore = 0.0; # print "[$plch], $prevlist\n"; # try all previous characters for ($j = 0; $j < length($prevlist); $j++) { $prch = substr($prevlist,$j,1); $score = $prob{$prch, $i-1} * $bigram{$prch, $plch} / $unigram{$prch} * $obs{$plch, $crch} / $unigram{$plch}; # print "<$plch, $prch, $score>\n"; if ($score > $bestscore) { $bestscore = $score; $best = $prch; } } if ($bestscore > 0.0) { $curlist = "$curlist$plch"; $prob{$plch, $i} = $bestscore; $back{$plch, $i} = $best; # print "{$plch}, {$best}, {$bestscore}\n"; } } # print "->$curlist\n"; $prevlist = $curlist; } # Done computing prob and back, build the string. $rpln = ""; # print "=>$prevlist\n"; $best = "DUMMY"; $bestscore = 0.0; for ($j = 0; $j < length($prevlist); $j++) { $plch = substr($prevlist,$j,1); if ($prob{$plch, $len-1} > $bestscore) { $bestscore = $prob{$plch, $len-1}; $best = $plch; } } # print "=$best\n"; $i = $len-1; while ($i >= 0) { $rpln = "$best$rpln"; $best = $back{$best, $i}; $i--; } return ($rpln); } # Given a plain line, a corrupted line, and a repaired line, compare # them to get statistics. sub compareline { local($plln, $crln, $rpln) = @_; local($i, $len); local($r1, $w1) = (0,0); local($r2, $w2) = (0,0); local($plch, $crch, $rpch); $len = length($plln); if (length($crln) != $len) { print STDERR "PLAIN: `$plln' does not match CORRUPT: `$crln'\n"; exit; } if (length($rpln) != $len) { print STDERR "PLAIN: `$plln' does not match REPAIR: `$rpln'\n"; exit; } for ($i = 0; $i < $len; $i++) { $plch = substr($plln,$i,1); $crch = substr($crln,$i,1); $rpch = substr($rpln,$i,1); # print "[$plch$crch$rpch]"; if ($plch eq $crch) { if ($plch ne $rpch) { $w1++; } else { $r1++; } } else { if ($plch ne $rpch) { # wrong $w2++; } else { # right $r2++; } } } # print "\n"; return ($r1, $w1, $r2, $w2); }