#!/usr/bin/perl

#
# Builds the dictionary pages.
#
# Reads every "*.markdown" entry below "src/dictionary/<name>/" and writes
# two kinds of output into the build directory:
#
#   index.markdown   - every word of every directory, grouped under a
#                      "# <name>" heading, with links to "<name>/<slug><ext>"
#   <name>.markdown  - the words of a single directory, with links to
#                      "<slug><ext>"
#
# Options:
#   --output, -o DIR   build directory (default: "<script dir>/../build")
#   --ext,    -e EXT   extension used in the generated links
#                      (default: ".markdown")
#

#
# Setup
#

# Directives
use strict;
use warnings;
use utf8;
binmode(STDOUT, ":utf8");
binmode(STDERR, ":utf8");

# Modules
use File::Basename;
use Getopt::Long;

# Variables
my $EXT = ".markdown";
my $REPO_DIR = dirname($0) . "/..";
my $BUILD_DIR = "$REPO_DIR/build";

# The return value is intentionally not checked so that invocations with
# unrecognised options keep running, exactly as they did before.
GetOptions(
    "output|o=s" => \$BUILD_DIR,
    "ext|e=s"    => \$EXT,
);

#
# Paths
#

# These scripts are designed to work within the Git repository and
# make assumptions about all the relative paths and outputs.
my $DICT_MARKDOWN = "$BUILD_DIR/index.markdown";
my $DICT_DIR = "$REPO_DIR/src/dictionary";

# Make sure the build directory exists.
unless (-d $BUILD_DIR) {
    print STDERR "Creating build directory\n";
    system("mkdir", "-p", $BUILD_DIR) == 0
        or die "Cannot create build directory $BUILD_DIR\n";
}

#
# Processing
#

# Output handles shared with process_word(): $DICT is the combined index,
# $SYB is the page of the directory currently being processed.
my ($DICT, $SYB);

# Create the initial Markdown file.
print STDERR "Writing to $DICT_MARKDOWN\n";
open $DICT, ">:encoding(UTF-8)", $DICT_MARKDOWN
    or die "Cannot write dictionary file ($!)";

# Write out the front matter.
print {$DICT} join(
    "\n",
    "---",
    "title: Miwāfu Dictionary",
    "breadcrumbTitle: Dictionary",
    "---"), "\n\n";

# Loop through the directories in the dictionary.
for my $dir (sort(glob("$DICT_DIR/*"))) {
    # Figure out the basename.
    my $bs = basename($dir);

    # Determine if we have any entries in here. This is checked before the
    # output file is opened so that an empty directory (or a stray plain
    # file) does not leave a zero-byte page and an unclosed handle behind.
    my @entries = sort(glob("$dir/*.markdown"));
    next unless @entries;
    my $count = scalar(@entries);

    # Open up the syllable file.
    open $SYB, ">:encoding(UTF-8)", "$BUILD_DIR/$bs.markdown"
        or die "Cannot write $bs dictionary file ($!)";

    # Write out the entry.
    print STDERR "Processing: $bs ($count entries)\n";
    print {$DICT} "# $bs\n\n";
    print {$SYB} "---\ntitle: Miwāfu Dictionary - $bs\nbreadcrumbTitle: $bs\n---\n\n# $bs\n\n";

    # Go through each of these entries.
    for my $entry (@entries) {
        process_word($entry, $bs);
    }

    # Close the file; buffered write errors only surface here.
    close $SYB
        or die "Cannot finish $bs dictionary file ($!)";
}

# Finish up the dictionary.
close $DICT
    or die "Cannot finish dictionary file ($!)";

#
# Finished
#

print STDERR "Done\n";

#
# Subroutines
#

# process_word($file, $bs)
#
# Parses one dictionary entry file and appends one Markdown paragraph per
# word to both output handles ($DICT and $SYB, which must already be open).
#
#   $file - path of the entry file to read (decoded as UTF-8)
#   $bs   - basename of the directory the entry lives in; used as the
#           link prefix in the combined index
#
# Dies if the entry file cannot be opened.
sub process_word {
    # Pull out the entries from the file.
    my ($file, $bs) = @_;

    # Heading names that are shortened in the output; nouns and verbs keep
    # their full (lower-cased) name.
    my %abbrev_for = (
        adverb    => "adv",
        adjective => "adj",
        pronoun   => "pro",
        marker    => "mark",
    );

    # Read in this file and process the entries.
    open my $word_fh, "<:encoding(UTF-8)", $file
        or die "Cannot read $file ($!)";

    # The format of the file matches Wiktionary's format which is not
    # how most dictionaries are created.
    my $pos;
    my $word;
    my %defs;

    while (my $line = <$word_fh>) {
        # Clean up the line.
        chomp $line;
        next if $line =~ /^\s*$/;
        next if $line =~ /^=/;
        next if $line =~ /^-/;

        # Figure out the parts of speech.
        # NOTE(review): this pattern is unanchored, so any line merely
        # containing one of these names is treated as a heading - confirm
        # against the entry format before tightening it.
        if ($line =~ m@(Noun|Verb|Marker|Pronoun|Adjective|Adverb)@) {
            my $name = lc($1);
            $pos = exists $abbrev_for{$name} ? $abbrev_for{$name} : $name;
            next;
        }

        # A "Related" line ends the current part of speech.
        if ($line =~ m@^Related$@) {
            $pos = undef;
        }

        # If we haven't hit a POS, then skip it.
        next unless defined $pos;

        # If we have a number in the beginning, then it's a definition.
        if ($line =~ m@^(\d+)\. (.*?)$@) {
            my ($number, $text) = ($1, $2);

            # A definition with no preceding word has nothing to attach to.
            unless (defined $word) {
                print STDERR "Skipping definition without a word in $file: $line\n";
                next;
            }

            push @{ $defs{$word}{$pos} }, "**$number** $text";
            next;
        }

        # Anything else is a word to add; drop any leading Markdown
        # heading markers first.
        $line =~ s@^\#+\s+@@;
        $word = $line;

        # Make sure the word is listed under this POS even if no
        # definitions follow.
        $defs{$word}{$pos} ||= [];
    }

    # Finish up the file.
    close $word_fh;

    # Write out the Markdown line.
    for my $entry (sort(keys(%defs))) {
        # Start by formatting the word: strip the accents and turn runs of
        # whitespace or dashes into a single dash to build the link target.
        my $slug = $entry;
        $slug =~ s/[áàā]/a/g;
        $slug =~ s/[éèē]/e/g;
        $slug =~ s/[íìī]/i/g;
        $slug =~ s/[óòō]/o/g;
        $slug =~ s/[úùū]/u/g;
        $slug =~ s/[\s-]+/-/g;

        print {$DICT} "[$entry]($bs/$slug$EXT):";
        print {$SYB} "[$entry]($slug$EXT):";

        my $buffer = "";

        # Add in the parts of speech, always in this fixed order.
        for my $label (qw(noun verb adj adv pro mark)) {
            # If we don't have one, skip it.
            next unless exists $defs{$entry}{$label};

            # Add in the POS.
            $buffer .= " *$label* ";

            # Go through the definitions.
            $buffer .= join(" ", @{ $defs{$entry}{$label} });
        }

        print {$DICT} "$buffer\n\n";
        print {$SYB} "$buffer\n\n";
    }

    return;
}