Friday, April 18, 2008

searchpg.pl and edict2psql.pl

I've decided to make my EDICT setup a bit faster, and went with PostgreSQL to do it. Here's the population script:

edict2psql.pl

#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use PerlIO::gzip;
use Encode;
use DBI;

scalar @ARGV || die "Usage: $0 edict\n";
my $edict = shift @ARGV;
my $db = shift @ARGV;

#!-e $db || die "Database file already exists: $db\n";


my $srch = '';
scalar @ARGV and ($srch = shift @ARGV);

open(EDICT, '<:gzip', $edict) || die "$!\n";
my $dbh = DBI->connect("dbi:Pg:dbname=jdic", "$username", "$password") || die "Unable to access jdic.\n";
# Give us a clear DB
$dbh->do("DROP TABLE words") if($dbh->selectrow_arrayref("SELECT * FROM pg_tables WHERE schemaname = 'public' AND tablename = 'words'"));
$dbh->do("DROP TABLE defns") if($dbh->selectrow_arrayref("SELECT * FROM pg_tables WHERE schemaname = 'public' AND tablename = 'defns'"));

$dbh->do("CREATE TABLE words(id SERIAL PRIMARY KEY, preferred BOOL, kanji VARCHAR(192), kana VARCHAR(192), flags VARCHAR(64), defn INTEGER)") || die "Error creating word table.\n";
$dbh->do("CREATE TABLE defns(id SERIAL PRIMARY KEY, content VARCHAR(1536))") || die "Error creating definition table.\n";
my $wcreate = $dbh->prepare("INSERT INTO words(preferred, kanji, kana, flags, defn) VALUES(?, ?, ?, ?, ?)");
#my $wkcreate = $dbh->prepare("INSERT INTO words(preferred, content, defn) VALUE(?, ?, ?)");
my $dfncreate = $dbh->prepare("INSERT INTO defns(content) VALUES(?)");
while(<EDICT>) {
$_ = encode('utf-8', decode('euc-jp', $_));
my @parts = split(/\//, $_);
s/\s+$// foreach(@parts);
my $word = shift @parts;
pop @parts if($parts[$#parts] !~ /\S/);
my $preferred = 0;
if($parts[$#parts] eq '(P)') {
$preferred = 1;
pop @parts;
}
my $defn = join('; ', @parts);
my $kana = undef;
my $flags = undef;
if($word =~ /\[([^\]]+)\]/) {
$kana = $1;
$word =~ s/\s+\[.*$//g;
}
while($defn =~ /^(\(ok\)|\(ik\))\s+(.+)$/) {
$flags and ($flags .= "$1 ") or ($flags = "$1 ");
$defn = $2;
}
my $fetchdefn = $dbh->prepare("SELECT defns.id FROM defns WHERE defns.content = ?");
$fetchdefn->execute($defn);
my $row = $fetchdefn->fetchrow_arrayref;
if($row) {
$wcreate->execute(($preferred and 'TRUE' or 'FALSE'), $word, $kana, $flags, ${$row}[0]);
}
else {
$dfncreate->execute($defn);
$fetchdefn->execute($defn);
$row = $fetchdefn->fetchrow_arrayref;
if($row) {
$wcreate->execute(($preferred and 'TRUE' or 'FALSE'), $word, $kana, $flags, ${$row}[0]);
}
}
}
close(EDICT);
print "Generation complete.\n";
$dbh->disconnect;

And, of course, the searching script, which uses PerlArg.pm:

searchpg.pl

#!/usr/bin/perl -w
use strict;
use warnings;
use utf8;
use PerlIO::gzip;
use Encode;
use DBI;
use IPC::Open2;
use PerlArg;

sub k2h {
my $ret = shift;
my $pid = open2(*OTRANS, *ITRANS, 'sed', 'y/アイウエオカキクケコガギグゲゴサシスセソ ザジズゼゾタチツテトダヂヅデドナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨラリルレロワンッャュョァィゥェォ/あいうえおかきくけこ がぎぐげごさしすせそざじずぜぞたちつてとだぢづでどなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよらりるれろわんっゃゅょぁぃぅぇぉ/');

print ITRANS $ret;
close(ITRANS);
$ret = '';
($ret .= $_) while(<OTRANS>);
close(OTRANS);
waitpid($pid, 0);
return $ret;
}
sub h2k {
my $ret = shift;
my $pid = open2(*OTRANS, *ITRANS, 'sed', 'y/あいうえおかきくけこがぎぐげごさしすせそ ざじずぜぞたちつてとだぢづでどなにぬねのはひふへほばびぶべぼぱぴぷぺぽまみむめもやゆよらりるれろわんっゃゅょぁぃぅぇぉ/アイウエオカキクケコ ガギグゲゴサシスセソザジズゼゾタチツテトダヂヅデドナニヌネノハヒフヘホバビブベボパピプペポマミムメモヤユヨラリルレロワンッャュョァィゥェォ/');

print ITRANS $ret;
close(ITRANS);
$ret = '';
($ret .= $_) while(<OTRANS>);
close(OTRANS);
waitpid($pid, 0);
return $ret;
}
my $usage = "Usage: $0 [ --mode=(japanese|english) | -j | -e ] term\n";
my $pars = PerlArg->new(Long => {mode => 'japanese'}, ShortSynoms => {j => 'mode=japanese', e => 'mode=english'}, Help => $usage);

$pars->parseArgs(@ARGV);

$pars->getAnonymousLength > 0 || die $usage;
my $mode = $pars->getValue('mode');
($mode eq 'japanese' || $mode eq 'english') || die $usage;
my $term = $pars->getAnonymous(0);


my $dbh = DBI->connect("dbi:Pg:dbname=jdic", "$username", "$password") || die "Unable to access jdic.\n";
$dbh->selectrow_arrayref("SELECT * FROM pg_tables WHERE schemaname = 'public' AND tablename = 'words'") and $dbh->selectrow_arrayref("SELECT * FROM pg_tables WHERE schemaname = 'public' AND tablename = 'defns'") || die "Tables do not exist.";

# Speed up using a cache
$dbh->do("CREATE TEMPORARY TABLE cache(kanji VARCHAR(192), kana VARCHAR(192), flags VARCHAR(64), preferred BOOL, defn INTEGER, content VARCHAR(1536))");
if($mode eq 'japanese') {
my ($hterm, $kterm) = (k2h($term), h2k($term));
my $srch = $dbh->prepare("INSERT INTO cache(kanji, kana, flags, preferred, defn, content) SELECT words.kanji, words.kana, words.flags, words.preferred, defns.id, defns.content FROM words INNER JOIN defns ON words.defn = defns.id WHERE (words.kanji ~* ? OR words.kanji ~* ? OR words.kanji ~* ?) OR (words.kana ~* ? OR words.kana ~* ? OR words.kana ~* ?)");
$srch->execute($term, $hterm, $kterm, $term, $hterm, $kterm);

$dbh->do('INSERT INTO cache(kanji, kana, flags, preferred, defn, content) SELECT words.kanji, words.kana, words.flags, words.preferred, defns.id, defns.content FROM words INNER JOIN defns ON words.defn = defns.id WHERE defns.id IN (SELECT DISTINCT ON (cache.defn) cache.defn FROM cache)');
}
elsif($mode eq 'english') {
my $srch = $dbh->prepare("INSERT INTO cache(kanji, kana, flags, preferred, defn, content) SELECT words.kanji, words.kana, words.flags, words.preferred, defns.id, defns.content FROM words INNER JOIN defns ON words.defn = defns.id WHERE bmatches(?, defns.content)");
$srch->execute($term);
}

my $srch = $dbh->prepare("SELECT DISTINCT ON (cache.defn) cache.defn, cache.content FROM cache");
$srch->execute;

my $rows = $srch->fetchall_arrayref;
my $wsearch = $dbh->prepare("SELECT DISTINCT ON (cache.kanji) cache.kanji, cache.kana, cache.flags, cache.preferred FROM cache WHERE cache.defn = ?");
if($rows) {
my $count = 0;
foreach(@{$rows}) {
my @row = @{$_};
$wsearch->execute($row[0]);
my @wrows = @{$wsearch->fetchall_arrayref};
my (@kanji, @kana);
foreach(@wrows) {
my @row = @{$_};
if($row[3]) {
unshift(@kana, $row[1] . ' (P)' . ($row[2] and " $row[2]" or '')) if($row[1]);
unshift(@kanji, $row[0] . ' (P)' . ($row[2] and " $row[2]" or ''));
}
else {
push(@kana, $row[1] . ($row[2] and " $row[2]" or '')) if($row[1]);
push(@kanji, $row[0] . ($row[2] and " $row[2]" or ''));
}
#push(@kana, $row[1] . ($row[2] != 0 and ' (P)' or '')) if($row[1]);
#push(@kanji, $row[0] . ($row[2] != 0 and ' (P)' or ''));
}
my $jkana = join('; ', @kana);
print join('; ', @kanji) . (length($jkana) > 0 and ' (' . $jkana . ')' or '') . "\n";
print "\t$row[1]\n\n";
$count += 1;
}
print "\n\nFound $count.\n" if($count);
}
$dbh->disconnect;

PL/Perl must be enabled in the database and this function defined within it in order to get word-boundary matching in English working:

CREATE FUNCTION bmatches(VARCHAR, VARCHAR) RETURNS BOOL AS $$
$_[1] =~ /\b$_[0]\b/i and return 'true' or return 'false';
$$ LANGUAGE plperl;
% perl searchpg.pl --help
Usage: searchpg.pl [ --mode=(japanese|english) | -j | -e ] term
% perl searchpg.pl -e bacon
カナディアンベーコン
(n) Canadian bacon

ショルダーベーコン
(n) shoulder bacon

ベーコン (P)
(n) bacon

ベーコンエッグ
(n) bacon (and) eggs



Found 4.
% perl searchpg.pl 仔猫
仔猫; 子猫; 小猫 (こねこ; こねこ; こねこ)
(n) kitten



Found 1.

I'll start working on KANJIDIC soon, I hope.

No comments: