#!/usr/bin/perl -w
# vim: nowrap:noet:ts=2:ai:si:tw=0:noic:nohls:nobk:sw=2:ft=perl
# [c] piotao, 20080120, 4gentoo, @venona
# $Id$

# Simple help is available after invoking:
#   perldoc piotao.pl
# (this works if you have perldoc installed)

use strict;
use warnings;
use locale;
use utf8;
use Getopt::Long;
use Data::Dumper;
use 5.8.0;

# Simple configuration hash for program options and internal data.
my %CFG = (
	filetype    => qr/\.xml$/i,  # extension regexp for scanned files
	verbose     => 0,            # verbose mode
	fix         => 0,            # auto-repair of typos (accepted, not implemented)
	eregs       => makeEregs(),  # inline set of polish typos
	files_count => 0,            # number of files opened by scanFile
	dirs_count  => 0,            # number of directories opened by scanDir
);

# Make all regexps as one but big one for quick testing of a whole file.
# Each qr// keeps its own flags when interpolated, so case-sensitive
# entries stay case-sensitive inside the combined pattern.
$CFG{alleregs} = join '|', map { $CFG{eregs}{$_}{ereg} } keys %{ $CFG{eregs} };
$CFG{alleregs} = qr/$CFG{alleregs}/msi;

# Run the driver only when executed as a script, so that the file can also
# be loaded (require/do) without scanning anything.
main() unless caller;

###########################################

# Program entry point: parses CLI options, scans every directory given on
# the command line (or '.' when none is given) and prints a summary in
# verbose mode.
sub main {
	GetOptions(
		'verbose|v!' => \$CFG{verbose},  # --verbose, -v (switch option)
		'fix|f!'     => \$CFG{fix},      # --fix, -f (switch option)
	);
	warn "--fix is not implemented yet, running in search mode only\n"
		if $CFG{fix};

	# Remember the real list of start directories so the summary is right
	# even when the default '.' is used.
	my @roots = @ARGV ? @ARGV : ('.');
	foreach my $dir (@roots) {
		print "Dir: $dir\n" if $CFG{verbose};
		scanDir($dir);
	}

	if ( $CFG{verbose} ) {
		print "Finished in " . ( time - $^T ) . "s.\n";
		print "Scanned " . scalar(@roots)
			. " main locations, $CFG{dirs_count} dirs, $CFG{files_count} files\n";
	}
	return;
}

# Main sub for single file testing.
#   $path - directory containing the file
#   $name - file name inside $path (also used in the report)
# Prints "file: NAME" once, followed by " LINE: TYPO" for every typo pattern
# matching a line. Warns and returns when the file cannot be opened.
sub scanFile {
	my ( $path, $name ) = @_;
	my $filename = "$path/$name";

	# Three-argument open: a file name can never be taken for an open mode.
	# The content is decoded as UTF-8 because the typo patterns are character
	# strings (use utf8); raw bytes would never match Polish diacritics.
	open my $file, '<:encoding(UTF-8)', $filename or do {
		warn "Cannot open file: '$filename' $!\n";
		return;
	};
	$CFG{files_count}++;
	my $content = do { local $/; <$file> };
	close $file;

	# Quick rejection: most files contain no typo at all.
	return unless defined $content and $content =~ $CFG{alleregs};

	# Sorted names give a stable report order. Labels are encoded back to
	# UTF-8 bytes so everything printed (file names included) is bytes.
	my @typos = map {
		my $label = $_;
		utf8::encode($label);
		[ $CFG{eregs}{$_}{ereg}, $label ];
	} sort keys %{ $CFG{eregs} };

	my $linenr   = 0;
	my $errprint = 0;
	# "\r\n" has to be tried first, otherwise every CRLF counts as two lines.
	foreach my $line ( split /\r\n|\r|\n/, $content ) {
		$linenr++;
		foreach my $typo (@typos) {
			my ( $ereg, $label ) = @{$typo};
			next unless $line =~ $ereg;
			print "file: $name\n" unless $errprint++;
			print " $linenr: $label\n";
		}
	}
	return;
}

# Our own implementation of recursive directory scan;
# we don't rely on File::Find (could be missing).
#   $path - directory to scan; trailing slashes are ignored
# Symlinks are skipped, subdirectories are scanned recursively and text
# files matching $CFG{filetype} are passed to scanFile.
sub scanDir {
	my $path = shift;
	$path =~ s{(?<=[^/])/+\z}{};  # keep a lone '/' (root directory) intact

	my $dir;
	unless ( opendir $dir, $path ) {
		warn "Cannot open dir: '$path' $!\n";
		return;
	}
	$CFG{dirs_count}++;

	# Explicit defined(): an entry named "0" must not end the loop.
	while ( defined( my $item = readdir $dir ) ) {
		next if $item eq '.' or $item eq '..';
		my $full = "$path/$item";
		next if -l $full;  # avoid symlinks (tested on the full path)
		if ( -d $full ) {  # if this is directory
			print "dir: $item\n" if $CFG{verbose};
			scanDir($full);
		}
		elsif ( -T $full and $item =~ $CFG{filetype} ) {
			scanFile( $path, $item );
		}
	}
	closedir $dir;
	return;
}

# Let's make all of those terrible typos.
# Returns a hash reference: typo name => { ereg => ..., fix => ... }.
sub makeEregs {
	# A new pattern consists of a simple name within ''
	# and then of two fields: ereg and fix.
	#
	#   ereg = a real regular expression for matching typo text
	#   fix  = a real text for valid and legal replacement
	#          (not used by the scanner yet)
	#
	return {
		# '' => {
		#   ereg => qr//msi,
		#   fix  => '',
		# },
		'wg.'           => { ereg => qr/\bwg\./msi,              fix => 'wg' },
		'nr.'           => { ereg => qr/\bnr\./msi,              fix => 'nr' },
		'mgr.'          => { ereg => qr/\bmgr\./msi,             fix => 'mgr' },
		'dr.'           => { ereg => qr/\bdr\./msi,              fix => 'dr' },
		'v-ce mistrz'   => { ereg => qr/\bv-ce\s+mistrz/msi,     fix => 'wicemistrz' },
		'v-ce minister' => { ereg => qr/\bv-ce\s+minister/msi,   fix => 'wiceminister' },
		'm/in.'         => { ereg => qr/\bm\s*\/\s*in\./msi,     fix => 'm.in.' },
		'm. in.'        => { ereg => qr/\bm\.\s+in\./msi,        fix => 'm.in.' },
		'np:'           => { ereg => qr/\bnp:/msi,               fix => 'np.' },
		'wogóle|wogule|wogle' => {
			ereg => qr/\b(?:wogóle|wogule|wogle|w\s*og[uo]le)\b/msi,
			fix  => 'w ogóle',
		},
		'w/w|w.w.'      => { ereg => qr/\bw\/w\b|\bw\.w\./msi,   fix => 'ww.' },
		# The typo is the joined form; "w głąb" itself is correct.
		'wgłąb'         => { ereg => qr/\bwgłąb/msi,             fix => 'w głąb' },
		'z przed'       => { ereg => qr/\b[sz]\s+przed/msi,      fix => 'sprzed' },
		'spowrotem'     => { ereg => qr/\b[zs]powrotem\b/msi,    fix => 'z powrotem' },
		'mimo,że'       => { ereg => qr/\bmimo,że\b/msi,         fix => 'mimo, że' },
		'mimo,iż'       => { ereg => qr/\bmimo,iż\b/msi,         fix => 'mimo, iż' },
		'dlatego bo'    => { ereg => qr/dlatego,?\s+bo\b/msi,    fix => 'dlatego że' },
		'pomimo,że'     => { ereg => qr/\bpomimo,że\b/msi,       fix => 'pomimo, że' },
		'pomimo,iż'     => { ereg => qr/\bpomimo,iż\b/msi,       fix => 'pomimo, iż' },
		'po,'           => { ereg => qr/\bpo,/msi,               fix => 'po ' },
		'możnaby'       => { ereg => qr/\bmożnaby\b/msi,         fix => 'można by' },
		'mógł by'       => { ereg => qr/\bmógł\s+by\b/msi,       fix => 'mógłby' },
		'mogła by'      => { ereg => qr/\bmogła\s+by\b/msi,      fix => 'mogłaby' },
		'mogło by'      => { ereg => qr/\bmogło\s+by\b/msi,      fix => 'mogłoby' },
		'zrobił bym'    => { ereg => qr/\bzrobił\s+bym/msi,      fix => 'zrobiłbym' },
		# Only the capital "P" is the mistake, so that word stays
		# case-sensitive; otherwise the correct form would be reported too.
		'język Polski'  => { ereg => qr/\b(?i:język)\s+Polski/ms, fix => 'język polski' },
		'pare'          => { ereg => qr/\bpare\b/msi,            fix => 'parę' },
		'imie'          => { ereg => qr/\bimie\b/msi,            fix => 'imię' },
		'umię'          => { ereg => qr/\bumię\b/msi,            fix => 'umiem' },
		'rozumię'       => { ereg => qr/\brozumię\b/msi,         fix => 'rozumiem' },
		'prosze'        => { ereg => qr/\bprosze\b/msi,          fix => 'proszę' },
		'koleji'        => { ereg => qr/\bkoleji/msi,            fix => 'kolei' },
		'pierszy'       => { ereg => qr/\bpierszy\b/msi,         fix => 'pierwszy' },
		'piersi'        => { ereg => qr/\bpiersi\b/msi,          fix => 'pierwsi' },
		'czeci'         => { ereg => qr/\bczeci\b/msi,           fix => 'trzeci' },
		'trzcionka'     => { ereg => qr/\btrzcionka\b/msi,       fix => 'czcionka' },
		'ludzią'        => { ereg => qr/\bludzią\b/msi,          fix => 'ludziom' },
		'napewno'       => { ereg => qr/\bnapewno\b/msi,         fix => 'na pewno' },
		'poprostu'      => { ereg => qr/\bpoprostu\b/msi,        fix => 'po prostu' },
		'conajmniej'    => { ereg => qr/\bconajmniej\b/msi,      fix => 'co najmniej' },
		'na codzień'    => { ereg => qr/\bna\s+codzień/msi,      fix => 'na co dzień' },
		'naco dzień'    => { ereg => qr/\bnaco\s+dzień\b/msi,    fix => 'na co dzień' },
		'na prawdę'     => { ereg => qr/\bna\s+prawdę\b/msi,     fix => 'naprawdę' },
		'na przeciwko'  => { ereg => qr/\bna\s+przeciwko/msi,    fix => 'naprzeciwko' },
		'wszechczasów'  => { ereg => qr/\bwszechczasów/msi,      fix => 'wszech czasów' },
		'na raz'        => { ereg => qr/\bna\s+raz/msi,          fix => 'naraz' },
		',którego'      => { ereg => qr/,którego/msi,            fix => ', którego' },
		# braces are excluded from checking
		# '( ' => {
		#   ereg => qr/\(\s+/msi,
		#   fix  => '(',
		# },
		# ' )' => {
		#   ereg => qr/\s+\)/msi,
		#   fix  => ')',
		# },
		# so is colon
		# ' :' => {
		#   ereg => qr/\s+:/msi,
		#   fix  => ':',
		# },
		'HTMLa'         => { ereg => qr/\bHTMLa\b/msi,           fix => 'HTML-a' },
		'Linuxie'       => { ereg => qr/\bLinuxie\b/msi,         fix => 'Linuksie' },
		'Linux\'ie'     => { ereg => qr/\bLinux["']ie\b/msi,     fix => 'Linuksie' },
		'Firefoxie'     => { ereg => qr/\bFirefoxie\b/msi,       fix => 'Firefoksie' },
		'Firefox\'ie'   => { ereg => qr/\bFirefox['"]ie\b/msi,   fix => 'Firefoksie' },
		'Windows\'ie'   => { ereg => qr/\bWindows['"]ie\b/msi,   fix => 'Windowsie' },
		'dyńcz'         => { ereg => qr/dyńcz\b/msi,             fix => 'dyncz' },
		'nielada'       => { ereg => qr/\bnielada\b/msi,         fix => 'nie lada' },
		'niebyle'       => { ereg => qr/\bniebyle\b/msi,         fix => 'nie byle' },
		'menager'       => { ereg => qr/\bmenager\b/msi,         fix => 'manager' },
		# '' => {
		#   ereg => qr/\b\b/msi,
		#   fix  => '',
		# },
	};
}

1;

__END__

=pod

=head1 NAME

Program for hunting typos and
other mistakes in written text.

=head2 AUTHOR

Piotr Arłukowicz,
University of Gdańsk, IMFiI
Division of Computer Science
Chair of Artificial Intelligence

=head2 SYNOPSIS

  script.pl /absolute/path/to/xml/files
  script.pl ../relative/path
  script.pl /multiple /paths /to /many /dirs
  script.pl --fix /path
  perldoc script.pl

=head2 NO OPTIONS

  script.pl
  script.pl `pwd`

When this script is invoked without options, it works in search mode on the
current directory (C<.>). The two examples above are equivalent.

  perldoc script.pl

This is not really a program invocation but a call to Perl's native help
system, C<perldoc>. It makes perldoc parse the script as a text file and
present all POD found inside as a man page, displayed by your system pager.

=head2 OPTIONS

  --verbose, -v

Enables verbose mode: the scanned directories and a final summary are
printed.

  --fix, -f

Enables auto-repair mode. The script performs the usual tests, but can also
modify files in place, replacing matching typos. Because of the nature of
regexps and possibly wide matches, this can be dangerous.

=head2 NOTES

The fix option (C<--fix>) is not yet implemented. Consider this the safer
choice, really.

=head2 BUGS

Sometimes the program can match a typo which is a substring of a longer
sentence. This is caused by inaccurate regexp patterns and can be avoided by
the user: think carefully when adding entries to the typos database.