Creating an automated dictionary file.

2015-01-02 13:12:01 -06:00 · 2015-01-02 13:12:01 -06:00 · 3b10eecf1f
commit 3b10eecf1f
parent 4456521536
4 changed files with 182 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,4 @@
 *~
+
+build/
+
--- a/bin/create-dictionary-markdown
+++ b/bin/create-dictionary-markdown
@ -0,0 +1,168 @@
+#!/usr/bin/perl
+
+#
+# Setup
+#
+
+# Directives
+use strict;
+use warnings;
+use utf8;
+
+# Encode everything printed to STDOUT and STDERR as UTF-8.
+binmode(STDOUT, ":utf8");
+binmode(STDERR, ":utf8");
+
+# Modules
+use File::Basename;
+
+#
+# Paths
+#
+
+# This script is designed to run inside the Git repository: every path
+# below is derived from the script's own location (bin/ under the
+# repository root), so the relative layout of the repository is assumed.
+my $REPO_DIR = dirname(dirname($0));
+my $BUILD_DIR = "$REPO_DIR/build";
+my $DICT_MARKDOWN = "$BUILD_DIR/dictionary.markdown";
+my $DICT_DIR = "$REPO_DIR/dictionary";
+
+# Make sure the build directory exists. Stop right away if it cannot be
+# created, since the dictionary file could not be written afterwards.
+unless (-d $BUILD_DIR)
+{
+	print STDERR "Creating build directory\n";
+	mkdir($BUILD_DIR)
+		or die "Cannot create build directory $BUILD_DIR ($!)";
+}
+
+#
+# Processing
+#
+
+# Create the initial Markdown file. The bareword DICT handle is kept
+# because process_word() prints to it directly.
+open DICT, ">:encoding(UTF-8)", $DICT_MARKDOWN
+	or die "Cannot write dictionary file ($!)";
+
+# Write out the front matter.
+print DICT join(
+	"\n",
+	"---",
+	"title: Miwāfu Dictionary",
+	"---"), "\n\n";
+
+# Loop through the directories in the dictionary; each one that holds
+# entries becomes a top-level section of the output.
+for my $section_dir (sort(glob("$DICT_DIR/*")))
+{
+	# The directory name doubles as the section heading.
+	my $section = basename($section_dir);
+
+	# Skip anything that holds no Markdown entries.
+	my @entry_files = sort(glob("$section_dir/*.markdown"));
+	next unless @entry_files;
+	my $entry_count = scalar(@entry_files);
+
+	# Write out the section heading.
+	print STDERR "Processing: $section ($entry_count entries)\n";
+	print DICT "# $section\n\n";
+
+	# Append every entry file in this section.
+	for my $entry_file (@entry_files)
+	{
+		process_word($entry_file);
+	}
+}
+
+# Finish up the dictionary. Buffered write errors only surface at close,
+# so a failure here means the output file is incomplete.
+close DICT
+	or die "Cannot finish writing dictionary file ($!)";
+
+#
+# Finished
+#
+
+print STDERR "Done\n";
+
+#
+# Subroutines
+#
+
+# Reads one dictionary entry file and appends its words to the DICT handle.
+#
+# The entry files follow Wiktionary's layout rather than a conventional
+# dictionary's: a part-of-speech heading (e.g. "Noun") is followed by one
+# or more words, and each word is followed by its numbered definitions.
+# Blank lines and lines starting with "=" or "-" are ignored, as is
+# everything before the first part-of-speech heading and everything from
+# a "Related" line up to the next part-of-speech heading.
+#
+# Each word becomes one paragraph of the form:
+#   **word**: *pos* **1** definition **2** definition ...
+#
+# Parameters:
+#   $file - path of the entry file to read.
+#
+# Dies if the file cannot be opened. Writes to the global DICT handle,
+# which the caller must already have opened.
+sub process_word
+{
+	# Pull out the entries from the file.
+	my ($file) = @_;
+
+	# Abbreviations used in the output for the longer parts of speech.
+	my %abbreviations = (
+		adverb => "adv",
+		adjective => "adj",
+		pronoun => "pro",
+		marker => "mark");
+
+	# Read in this file and process the entries.
+	open my $word_fh, "<:encoding(UTF-8)", $file
+		or die "Cannot read dictionary entry $file ($!)";
+
+	# Current part of speech, current word, and the collected
+	# definitions keyed by word and then by part of speech.
+	my $pos = undef;
+	my $word = undef;
+	my %defs = ();
+
+	while (my $line = <$word_fh>)
+	{
+		# Clean up the line.
+		chomp $line;
+		next if $line =~ /^\s*$/;
+		next if $line =~ /^=/;
+		next if $line =~ /^-/;
+
+		# A leading number marks a definition. This is tested before the
+		# headings so that a definition which mentions a part of speech
+		# (e.g. "1. Marker for ...") is not mistaken for a heading.
+		if ($line =~ m@^(\d+)\. (.*?)$@)
+		{
+			# A definition outside a part of speech, or before any
+			# word, has nothing to attach to.
+			next unless defined $pos && defined $word;
+			push @{$defs{$word}{$pos}}, "**$1** $2";
+			next;
+		}
+
+		# Figure out the parts of speech.
+		if ($line =~ m@(Noun|Verb|Marker|Pronoun|Adjective|Adverb)@)
+		{
+			$pos = lc($1);
+			$pos = $abbreviations{$pos} if exists $abbreviations{$pos};
+			next;
+		}
+
+		# A "Related" section is not part of any part of speech.
+		$pos = undef if $line =~ m@^Related$@;
+
+		# If we haven't hit a POS, then skip it.
+		next unless defined $pos;
+
+		# Anything else is a word to add. Registering it here, even
+		# before any definition is seen, makes sure it is written out.
+		$word = $line;
+		$defs{$word}{$pos} = [] unless defined $defs{$word}{$pos};
+	}
+
+	# Finish up the file.
+	close $word_fh;
+
+	# Write out one Markdown paragraph per word.
+	for my $entry (sort(keys(%defs)))
+	{
+		# Start by formatting the word.
+		my $buffer = "**$entry**:";
+
+		# Add in the parts of speech, always in this fixed order.
+		for my $entry_pos (qw(noun verb adj adv pro mark))
+		{
+			# If we don't have one, skip it.
+			next unless exists $defs{$entry}{$entry_pos};
+
+			# Add in the POS followed by its definitions.
+			$buffer .= " *$entry_pos* ";
+			$buffer .= join(" ", @{$defs{$entry}{$entry_pos}});
+		}
+
+		print DICT "$buffer\n\n";
+	}
+}
--- a/dictionary/he/heru.markdown
+++ b/dictionary/he/heru.markdown
@ -8,4 +8,14 @@ Noun
 hèru
 ----------------

-1. Stallion, horse.
+1. A stallion or male horse.
+
+hēru
+----------------
+
+1. A foal or young horse.
+
+héru
+----------------
+
+1. A mare or female horse.
--- a/dictionary/ro/rocho.markdown
+++ b/dictionary/ro/rocho.markdown
@ -30,6 +30,3 @@ rócho

 1. To have a passive-aggressive fight.
 2. To fight without revealing the purpose of the fight.
-
-
-ròcho: (v) 2. To fight for the sake of fighting.