[developers] Re: Morphological ambiguity and xfst-lkb interface
Emily M. Bender
ebender at u.washington.edu
Mon Apr 11 03:30:49 CEST 2005
Hi again,
Responding to my own post about interface to a morphological
analyzer. I got it running with some redefinitions of
add-morphs-to-morphs and add-words-to-chart. I'm attaching
the user-fns file with these functions in case they're helpful
to anyone else. The main thing I had to work around was
the assumption that there is just one form for each chart
position (stored in an array called *morphs*). The modification
to add-words-to-chart was required to treat *morphs* instead
as an array of lists (of forms for each position, passed
in my the morphological analyzer).
Presumably some change along these lines will be required
in a general solution to interfacing with external morphological
analyzers, but I hesitate to try to incorporate it right now,
not having time to try to consider what other cases might be
involved, etc.
I'm curious what :cto and :cfrom (properties of morph-edge
and chart-edge) are, especially since chart-edge also has
:from :to. I couldn't find any enlightening comments...
Emily
-------------- next part --------------
(in-package :lkb)
;;;
;;; identify characters that can form words; all other characters will create
;;; word boundaries and later be suppressed in tokenization.
;;;
(defun alphanumeric-or-extended-p (c)
(and (graphic-char-p c) (not (member c *punctuation-characters*))))
;;;
;;; determine surface order of constituents in rule: returns list of paths into
;;; feature structure of rule, i.e. (nil (args first) (args rest first)) for a
;;; binary rule, where the first list element is the path to the mother node of
;;; the rule.
;;;
(defun establish-linear-precedence (rule)
(let ((daughters
(loop
for args = (existing-dag-at-end-of rule '(args))
then (existing-dag-at-end-of args *list-tail*)
for daughter = (when args
(get-value-at-end-of args *list-head*))
for path = (list 'args) then (append path *list-tail*)
while (and daughter (not (eq daughter 'no-way-through)))
collect (append path *list-head*))))
(if (null daughters)
(cerror "Ignore it" "Rule without daughters")
(cons nil daughters))))
;;;
;;; detect rules that have orthographemic variation associated to them; those
;;; who do should only be applied within the morphology system; for the time
;;; being use value of NEEDS-AFFIX feature, though it would be nicer to rely
;;; on a type distinction of lexical rules or re-entrancy of ORTH.
;;;
(defun spelling-change-rule-p (rule)
(let ((affix (get-dag-value (tdfs-indef
(rule-full-fs rule)) 'needs-affix)))
(and affix (bool-value-true affix))))
;;;
;;; create feature structure representation of orthography value for insertion
;;; into the output structure of inflectional rules; somewhat more complicated
;;; than one might expect because of treatment for multi-word elements.
;;;
(defun make-orth-tdfs (orthography)
(let* ((unifications
(loop
for token in (split-into-words orthography)
for path = *orth-path* then (append path *list-tail*)
for opath = (create-path-from-feature-list
(append path *list-head*))
collect (make-unification :lhs opath
:rhs (make-u-value :type token))))
(indef (process-unifications unifications)))
(when indef
(make-tdfs :indef (create-wffs indef)))))
;;;
;;; assign priorities to parser tasks and lexical entries
;;;
(defun rule-priority (rule)
(case (rule-id rule)
(subj 1000)))
(defun gen-rule-priority (rule)
(rule-priority rule))
(defun lex-priority (mrec)
(declare (ignore mrec))
800)
(defun gen-lex-priority (fs)
(declare (ignore fs))
800)
;;;
;;; determine path and file names for lexicon and leaf type cache files.
;;;
#+:mcl
(defun lkb-tmp-dir ()
(let ((pathname (make-pathname :directory ":Users:erb"))
(tmp-dir '("tmp")))
(make-pathname
:host (pathname-host pathname) :device (pathname-device pathname)
:directory (append (pathname-directory pathname) tmp-dir)
:name (pathname-name pathname) :type (pathname-type pathname)
:version (pathname-version pathname))))
(defun set-temporary-lexicon-filenames nil
(let* ((version (or (find-symbol "*GRAMMAR-VERSION*" :common-lisp-user)
(and (find-package :lkb)
(find-symbol "*GRAMMAR-VERSION*" :lkb))))
(prefix
(if (and version (boundp version))
(remove-if-not #'alphanumericp (symbol-value version))
"lexicon")))
(setf *psorts-temp-file*
(make-pathname :name prefix
:directory (pathname-directory (lkb-tmp-dir))))
(setf *psorts-temp-index-file*
(make-pathname :name (concatenate 'string prefix ".idx")
:directory (pathname-directory (lkb-tmp-dir))))
(setf *leaf-temp-file*
(make-pathname :name (concatenate 'string prefix ".lfs")
:directory (pathname-directory (lkb-tmp-dir))))))
(defun bool-value-true (fs)
(and fs
(let ((fs-type (type-of-fs fs)))
(eql fs-type '+))))
(defun bool-value-false (fs)
(and fs
(let ((fs-type (type-of-fs fs)))
(eql fs-type '-))))
;;; Use tranducer loaded into the LKB for preprocessing
;;; Split string into words on white space, then run each
;;; word through the transducer to create a list of possibilities.
;;; Return a list of lists --- all possibilities for each word.
;;; It looks like parse() is already anticipating this possibility.
(defun preprocess-sentence-string (string)
(let ((words (split-into-words string)))
(loop for word in words
collect (transduce nil word))))
;;; Try redefining add-morphs-to-morphs to handle input
;;; that encodes morphological ambiguity (i.e., a list of
;;; lists of strings, instead of just a list of strings).
(defun add-morphs-to-morphs (preprocessed-input)
(let ((current 0)
(counter 0))
;;; preprocessed-input is a list of lists of strings.
;;; position is a list of strings representing the
;;; morphological analyses for that position.
(dolist (position preprocessed-input)
(let* ((cfrom current)
(cto (+ current 1)))
(setf (aref *morphs* current)
(loop for token in position
collect (let* ((base-word token)
(word (string-upcase base-word))
(morph-poss
(union
(filter-for-irregs
(remove-duplicates
(morph-analyse word)
:test #'equalp))
;; filter is in rules.lsp
(find-irregular-morphs word) :test #'equalp)))
(unless #+:ltemplates (template-p word) #-:ltemplates nil
(unless morph-poss
(format t "~%Word `~A' is not in lexicon." word)
(when *unknown-word-types*
(format t " Using unknown word mechanism.")))
(setf counter (+ counter 1)))
(when #-:null t #+:null (or morph-poss *unknown-word-types*)
(make-morph-edge :id counter :word base-word
:morph-results
(or morph-poss (list (list word)))
:cfrom cfrom
:cto cto))))))
(incf current))))
;;; Need to redefine add-words-to-chart() as well, since current
;;; version in lkb assumes *morphs* to be an array storing one
;;; form per chart position. Change this to an array of lists, with
;;; a list of forms per chart position. Since the lkb assumes multiple
;;; possibilities for everything else about chart positions, this change
;;; appears to be enough.
(defun add-words-to-chart (f)
(let ((current 0)
(to-be-accounted-for (make-array (list *chart-limit*)
:initial-element nil)))
;; to-be-accounted for is needed because we cannot tell that a word is
;; impossible until after the whole sentence has been processed because
;; it may be part of a multi-word
(loop
(let ((morph-poss (aref *morphs* current)))
(when (null morph-poss)
(return nil))
(incf current)
;; *morphs* is an array of lists for this grammar
(loop for m-edge in morph-poss
do (multiple-value-bind (ind-results multi-strings)
(add-word (morph-edge-word m-edge)
(morph-edge-morph-results m-edge) current
f (morph-edge-cfrom m-edge)
(morph-edge-cto m-edge))
(unless (or ind-results multi-strings)
(setf (aref to-be-accounted-for current)
(morph-edge-word m-edge)))
;; record the fact we haven't analysed this word
(dolist (mstr multi-strings)
;; wipe the record for multi-words which allow for it
(let ((words (split-into-words mstr)))
(dotimes (x (length words))
(setf (aref to-be-accounted-for (- current x))
nil)))))))
(dotimes (y current)
(when (aref to-be-accounted-for y)
(format t "~%No sign can be constructed for `~(~a~)'"
(aref to-be-accounted-for y)))))))
More information about the developers
mailing list