203 lines
3.9 KiB
Perl
Executable file
203 lines
3.9 KiB
Perl
Executable file
#!/usr/bin/perl
|
|
|
|
#
|
|
# Setup
|
|
#
|
|
|
|
# Directives
|
|
use strict;
|
|
use warnings;
|
|
use utf8;
|
|
|
|
binmode(STDOUT, ":utf8");
|
|
binmode(STDERR, ":utf8");
|
|
|
|
# Modules
|
|
use File::Basename;
|
|
use Getopt::Long;
|
|
|
|
# Variables

# Extension used for the links written into the generated Markdown.
my $EXT = ".markdown";

# Parent of the directory that contains this script.
my $REPO_DIR = dirname($0) . "/..";

# Directory that receives the generated files.
my $BUILD_DIR = "$REPO_DIR/build";

# Let the caller override the output directory and the link extension.
# Getopt::Long reports the offending option itself; we stop instead of
# carrying on with settings the caller did not intend.
GetOptions(
    "output|o=s" => \$BUILD_DIR,
    "ext|e=s" => \$EXT,
) or die "Usage: $0 [--output DIR] [--ext EXT]\n";

#
# Paths
#

# This script is designed to work within the Git repository and makes
# assumptions about all the relative paths and outputs.
my $DICT_MARKDOWN = "$BUILD_DIR/index.markdown";
my $DICT_DIR = "$REPO_DIR/src/dictionary";

# Make sure the build directory exists.
unless (-d $BUILD_DIR)
{
    print STDERR "Creating build directory\n";

    # List-form system() bypasses the shell; a non-zero result means the
    # directory was not created, so fail here rather than at the first open.
    system("mkdir", "-p", $BUILD_DIR) == 0
        or die "Cannot create build directory $BUILD_DIR (status $?)\n";
}
|
|
|
|
#
|
|
# Processing
|
|
#
|
|
|
|
# Create the initial Markdown file.
#
# $DICT is the combined index and $SYB is the page for the syllable that is
# currently being processed.  Both are file-scoped because process_word
# writes to them directly.
my ($DICT, $SYB);
print STDERR "Writing to $DICT_MARKDOWN\n";
open $DICT, ">:encoding(UTF-8)", $DICT_MARKDOWN
    or die "Cannot write dictionary file ($!)";

# Write out the front matter.
print $DICT join(
    "\n",
    "---",
    "title: Miwāfu Dictionary",
    "breadcrumbTitle: Dictionary",
    "---"), "\n\n";

# Loop through the directories in the dictionary.
for my $s (sort(glob("$DICT_DIR/*")))
{
    # Figure out the basename.
    my $bs = basename($s);

    # Determine if we have any entries in here.  This is checked before the
    # syllable file is opened so that a directory without entries does not
    # leave an empty output file (and an open handle) behind.
    my @w = sort(glob("$s/*.markdown"));
    next unless @w;
    my $count = scalar(@w);

    # Open up the syllable file.
    open $SYB, ">:encoding(UTF-8)", "$BUILD_DIR/$bs.markdown"
        or die "Cannot write $bs dictionary file ($!)";

    # Write out the entry.
    print STDERR "Processing: $bs ($count entries)\n";
    print $DICT "# $bs\n\n";

    print $SYB "---\ntitle: Miwāfu Dictionary - $bs\nbreadcrumbTitle: $bs\n---\n\n# $bs\n\n";

    # Go through each of these entries.
    for my $entry (@w)
    {
        process_word($entry, $bs);
    }

    # Close the file.  Buffered write errors only surface here, so check it.
    close $SYB
        or die "Cannot finish $bs dictionary file ($!)";
}

# Finish up the dictionary.
close $DICT
    or die "Cannot finish dictionary file ($!)";

#
# Finished
#

print STDERR "Done\n";
|
|
|
|
#
|
|
# Subroutines
|
|
#
|
|
|
|
# process_word($file, $bs)
#
# Reads one dictionary entry file and appends a line for every word found in
# it to both the combined index ($DICT) and the current syllable page ($SYB).
#
#   $file - path of the entry file; it is read as UTF-8.
#   $bs   - name used as the path prefix of the links written to the index.
#
# Dies if the entry file cannot be opened.
sub process_word
{
    # Pull out the entries from the file.
    my ($file, $bs) = @_;

    # Read in this file and process the entries.  A lexical handle is used
    # so nothing leaks into the global namespace, and a failed open is fatal
    # instead of silently producing an empty entry.
    open my $word_fh, "<:encoding(UTF-8)", $file
        or die "Cannot read $file ($!)";

    # The format of the file matches Wiktionary's format which is not
    # how most dictionaries are created.
    my $pos = undef;
    my $word = undef;
    my %defs = ();

    # Abbreviations used in the output; anything not listed (noun, verb)
    # is written out as-is.
    my %short_pos = (
        adverb => "adv",
        adjective => "adj",
        pronoun => "pro",
        marker => "mark",
    );

    while (my $line = <$word_fh>)
    {
        # Clean up the line: drop blank lines and lines that start with
        # "=" or "-".
        chomp $line;
        next if $line =~ /^\s*$/;
        next if $line =~ /^=/;
        next if $line =~ /^-/;

        # Figure out the parts of speech.
        # NOTE(review): this match is unanchored, so any line that contains
        # one of these names anywhere is treated as a part-of-speech heading.
        # Confirm that words and definitions never contain them.
        if ($line =~ m@(Noun|Verb|Marker|Pronoun|Adjective|Adverb)@)
        {
            $pos = lc($1);
            $pos = $short_pos{$pos} if exists $short_pos{$pos};
            next;
        }

        # A "Related" section ends the current part of speech.
        if ($line =~ m@^Related$@)
        {
            $pos = undef;
        }

        # If we haven't hit a POS, then skip it.
        next unless defined $pos;

        # If we have a number in the beginning, then it's a definition.
        if ($line =~ m@^(\d+)\. (.*?)$@)
        {
            my ($number, $text) = ($1, $2);

            # A definition that shows up before any word has nothing to
            # attach to; report it rather than creating an empty-named entry.
            unless (defined $word)
            {
                print STDERR "Skipping definition without a word in $file: $line\n";
                next;
            }

            push @{$defs{$word}{$pos}}, "**$number** $text";
            next;
        }

        # Anything else is a word to add.  Strip leading "#" heading marks
        # and make sure the word has a (possibly empty) definition list for
        # this part of speech.
        $line =~ s@^\#+\s+@@;
        $word = $line;
        $defs{$word}{$pos} ||= [];
    }

    # Finish up the file.
    close $word_fh;

    # Write out the Markdown line.
    foreach my $entry (sort(keys(%defs)))
    {
        # Start by formatting the word: the slug is the word with accented
        # vowels reduced to plain ones and runs of whitespace or dashes
        # collapsed into a single dash.
        my $slug = $entry;

        $slug =~ s/[áàā]/a/g;
        $slug =~ s/[éèē]/e/g;
        $slug =~ s/[íìī]/i/g;
        $slug =~ s/[óòō]/o/g;
        $slug =~ s/[úùū]/u/g;
        $slug =~ s/[\s-]+/-/g;

        print $DICT "[$entry]($bs/$slug$EXT):";
        print $SYB "[$entry]($slug$EXT):";

        my $buffer = "";

        # Add in the parts of speech, always in this fixed order.
        for my $part (qw(noun verb adj adv pro mark))
        {
            # If we don't have one, skip it.
            next unless exists $defs{$entry}{$part};

            # Add in the POS.
            $buffer .= " *$part* ";

            # Go through the definitions.
            $buffer .= join(" ", @{$defs{$entry}{$part}});
        }

        print $DICT "$buffer\n\n";
        print $SYB "$buffer\n\n";
    }
}
|