#!/bin/tcsh

# NOTE: this lab must be run on a LINUX MACHINE
#
#   -- one of the 'e' or 'x' machines --
#
#   'e' machines include
#       e02 through e16
#
#   the faster 'x' machines (preferred) include
#       x01 through x65


# ----- STEP (1)
#
# copy the lab files to your home directory

~cschafer/i.mtrun/exec/do.cp


# ----- STEP (2)
#
# add demo utilities directory to your path

tcsh
setenv PATH /export/mt/pipeline/bin:$PATH
rehash


# ----- STEP (3)
#
# go to one of the corpus directories (e.g. french);
# there are also tetun, swedish, and archaic english

cd ~/mtrun/corpora/french/


# ----- STEP (4)
#
# you'll see the following files:
#
#   bible.english        - full corpus in english
#   bible.foreign        - full corpus in french
#
#   bible.train.english  - english training subset of corpus
#   bible.train.foreign  - french training subset of corpus
#
#   bible.eval.english   - english held-out subset of corpus
#   bible.eval.foreign   - french held-out subset of corpus
#
# now you need to train the MT system on a parallel corpus, but you
# may (at least initially) want to do this using a subset of the training
# corpora provided, in order to speed up training.  we suggest initially
# using a 1000-line parallel corpus.  generate the following small training corpus
# in your ~/mtrun/corpora/french/run/ directory as follows:
# (note that the input to giza++ training is a parallel text represented as 2 files,
#  *.e and *.f [specific file name suffixes are important] with one sentence per line,
#  where corresponding lines between the 2 files are translations of each other).
cd ~/mtrun/corpora/french/
# -p so that re-running this step does not fail when run/ already exists
mkdir -p run
head -1000 bible.train.english > run/bible.e
head -1000 bible.train.foreign > run/bible.f


# ----- STEP (5)
#
# NOTE: if you want to create your own training bitext from a sentence-aligned parallel corpus,
#       just run  ~/mtrun/tools/delims.pl < corpus.{e,f} > corpus.train.{e,f}
#       ...this script adds sentence delimiter markers which are important to the language model and decoder
#
# NOTE: ~/mtrun/tools/lc.pl < in > out
#       will produce a lowercased version of your corpus which you could use for training
#
# -----
# so now let's train the translation models as follows
# .. go to the directory containing your training data
#

cd ~/mtrun/corpora/french/run/


# ----- STEP (6)
#
# .. run giza++ training from this directory
#

2.runGIZA++.pl bible.e bible.f linux


# ----- STEP (7)
#
# we want to decode some of the held-out french sentences now.
# first, we need an english language model.
# we will construct the language model (LM) from the training portion of the english bible.
# the following script will extract a list of vocabulary items from the corpus
# (needed for building the language model).

cd ~/mtrun/corpora/french/
~/mtrun/tools/voc.pl < bible.english > ~/mtrun/lm/english.bible.1.voc


# ----- STEP (8)
#
# (building the language model)
#
# -------------------------------------------------------------------------
# NOTE: the language modeling toolkit only runs on SUN MACHINES
#
# please ssh into a sun machine to run the following commands.
# you can do this on one of the 's' machines (your local desktop computer
# will work for this).
tcsh
setenv PATH /export/mt/pipeline/bin:$PATH
rehash

cd ~/mtrun/corpora/french/
perl /export/mt/pipeline/bin/runCMUToolkit.pl bible.train.english -d ~/mtrun/lm/ -v ~/mtrun/lm/english.bible.1.voc

# ALSO NOTE: if you want to skip building your own language model for this exercise you can substitute
#            one of the existing language models which are already compiled:
#
#   ~/mtrun/lm/existing.french_english/bible.train.english.binlm
#   ~/mtrun/lm/existing.archaic_english/bible.train.english.binlm
#   ~/mtrun/lm/existing.tetun_english/tetun.train.english.binlm
#   ~/mtrun/lm/existing.swedish_english/bible.train.english.binlm
# -------------------------------------------------------------------------


# ----- STEP (9)
#
# before running the decoder we need to modify its config file.
# do the following:

cd ~/mtrun/corpora/french/
# -p so that re-running this step does not fail when decode/ already exists
mkdir -p decode
cd decode
cp ~/mtrun/files/isi-decoder.config .

# the config file's top 2 lines must be edited: for this example they should be modified to:
#
#   LanguageModelFile = YOUR_HOME_DIR/mtrun/lm/bible.train.english.binlm
#   TranslationModelConfigFile = YOUR_HOME_DIR/mtrun/corpora/french/run/linux/tmconfig.cfg
#
# NOTE that you can't use ~ in paths in this config file.
#
# after performing this edit, we can decode some sentences.
# the held-out french sentences are in
#   ~/mtrun/corpora/french/bible.eval.foreign
# have a look at this file, and pick a sentence to decode into english.
# at least initially, try a short sentence of 10-15 words; longer
# sentences may take a while to decode.
isi-decoder.linux --config isi-decoder.config

#
# NOTE - you can either run the decoder via the above command line and
#        type/paste french sentences in, or you can give it a file
#        on standard input


# ----- STEP (10) --- visualizing word alignments for the training data
#

cd ~/mtrun/corpora/french/run/linux/

~/mtrun/tools/selsent.pl results.A3.final 2 \
  | ~/mtrun/tools/giza2align.pl \
  | ~/mtrun/tools/align2fig.pl \
  | fig2dev -Lps \
  | gv -

# this visualizes alignments for the second sentence in the training data.
# selsent.pl takes a first argument (the file containing alignments) and a
# second argument (the number of the sentence to be visualized, counting from 1).
# NOTE: avoid long sentences (longer than 46 words in either language).