#!/usr/bin/perl #twolout.pl $| = 1; use Getopt::Std; $right = 0; getopts('s'); while (<>) { &prefix; if (/^$/) { print $_; # printf STDERR "\ntwolout: '$token'"; next; } if (/^#r=(.+)#t=(.+)$/) { $r = $1; $total = $2; # printf STDERR "\ntwolcount: $r $total"; next; } if (/^[(]?["][<][>]["]$/) { # empty lines $right = 0; # printf STDERR "\ntwolout: '\n'"; # print "\n"; next; } if (/^[(]?["][<]([0-9]+[^ ]*)[>]["]$/) { # numbers $left = $right; $right++; $token = $1; $base = $token; $token =~ s/^[\$][\\]//; $edge = "[ = $left = $right "; $tags = "NUM NOM"; $p = sprintf "%0.1e", 1; $reading = $edge . "

= $p = [ = \"$token \" = '$base' '$base' "; $reading .= " = \"$token \" = \"$base\" " if ($opt_s); $reading .= "$tags]] "; printf $reading; # printf STDERR "\ntwolout: '$reading'"; next; } s/\\!/!/g; if (/^[(]?["][<](.)[>]["]$/) { # punctuation $left = $right; $right++; $token = $1; $base = $token; $token =~ s/^[\$][\\]//; $edge = "[ = $left = $right "; $tags = "PUNC"; $p = sprintf "%0.1e", 1; $reading = $edge . "

= $p = [ = \"$token \" = '$base' '$base' "; $reading .= " = \"$token \" = \"$base\" " if ($opt_s); $reading .= "$tags]] "; printf $reading; # printf STDERR "\ntwolout: '$reading'"; $right = 0; next; } if (/^[(]?["][<](.+)[>]["]$/) { # tokens $left = $right; $right++; $token = $1; $token =~ s/^[\$][\\]//; $edge = "[ = $left = $right "; } if (/^\t[(]?["](.+)["] /) { # readings $base = $1; $tags = $'; $base =~ s/^[\$][\\]//; chop $tags; for ($tags) { s/[ )]*$//; @array = split(/[ =-]/); $length = $#array+1; s/.* = //; } # printf STDERR "\ntwolout length: $length"; $p = (($total-$length) || 1)/((($r-1)*$total) || 1); $p = sprintf "%0.1e", $p; # printf STDERR "\ntwolout p: $p = (($total-$length) || 1)/((($r-1)*$total) || 1)\n"; $reading = $edge . "

= $p = [ = \"$token \" = '$base' '$base' "; $reading .= " = \"$token \" = \"$base\" " if ($opt_s); $reading .= "$tags]] "; &postfix; # printf STDERR "\ntwolout: '$reading'"; printf $reading; } # if ($tags =~ /PUNC/) { # $right = 0; # } $tags = ""; } # printf "\n\n"; sub prefix { s/\"seurata\" PCP1 ACT/"seuraava" A/; # note TWO spaces! s/(?<=\"mikä\" )Q/FAIL/; s/(?<=\"kiitos\" )N NOM SG/INTJ/; s/(?<=\"koska\" )SUB C/INTERR ADV/; s/(?<=\"entä\" INTERR )ADV/CF/; s/(?<=\"jälkeen\" )ADV ILL/FAIL/; } sub postfix { for ($reading) { s/(?<='minä' PERS PRON)(.*) SG/$1 SG1/; # person marking s/(?<='sinä' PERS PRON)(.*) SG/$1 SG2/; s/(?<='me' PERS PRON)(.*) PL/$1 PL1/; s/(?<='te' PERS PRON)(.*) PL/$1 PL2/; s/(?