Change 877 by oe@mv on 2005/11/03 15:05:51

	fix two problems in orthographemic processing: (a) handling of `funny'
	characters (#\!, #\?, #\*, and, as of late, #\)) and escape conventions
	in %letter-set, %prefix, and %suffix annotations; and (b) recursive
	invocation of the segmentation machinery on irregularly-derived forms,
	i.e. treating the irregular entries much like string-level (rather than
	stem-anchored) variations.  also, add a `build' target to the Makefile
	and fix a minor bug with printing multi-word stems in derivations.

Affected files ...

... //pet/oe/cheap/Makefile#4 edit
... //pet/oe/cheap/inputtoken.cpp#3 edit
... //pet/oe/cheap/inputtoken.h#2 edit
... //pet/oe/cheap/item.cpp#4 edit
... //pet/oe/cheap/item.h#4 edit
... //pet/oe/cheap/morph.cpp#3 edit
... //pet/oe/cheap/morph.h#3 edit
... //pet/oe/cheap/parse.cpp#4 edit

Differences ...

==== //pet/oe/cheap/Makefile#4 (text) ====

19,21c19,21
< ICUROOT = /lingo/local
< ECLROOT = /lingo/local
< LKBROOT = /user/oe/src/delphin/lkb
---
> ICUROOT = /usr/local
> ECLROOT = /usr/local
> LKBROOT = $$DELPHINHOME/lkb
27c27
< CPPFLAGS1 = -Wall -g -DHASH_MAP_AVAIL -DUSEMMAP -DYY $(INCLUDE) \
---
> CPPFLAGS1 = -Wall -O6  -DHASH_MAP_AVAIL -DUSEMMAP -DYY $(INCLUDE) \
30c30
< # CPPFLAGS1 = -Wall -g -O3 -DYY $(INCLUDE)
---
> # CPPFLAGS1 = -Wall -g -DYY $(INCLUDE)
129c129
< # rebuild MRS library, using an installed ECL binary (in the PATH)
---
> # rebuild MRS and FSPP libraries, using an installed ECL binary
138a139,161
> fspp:
> 	( \
> 	  echo "(load \"$(LKBROOT)/src/general/loadup.lisp\")"; \
> 	  echo "(compile-system \"fspp\" :force t)"; \
> 	) | $(ECLROOT)/bin/ecl
> 	${RM} -f fspp.h
> 	${LN} -s $(LKBROOT)/include/fspp.h fspp.h
> 
> 
> #
> # 
> #
> build:
> 	p4 changes ../... | head -1 | awk '{print $$2}' > .version
> 	( cd ../../..; \
> 	  tar zpScvf /tmp/pet.tgz \
> 	      --exclude "*.o" --exclude "*~" \
>               --exclude "*/flop-" --exclude "*/flop+" \
> 	      pet/oe; \
> 	)
> 	mv /tmp/pet.tgz /tmp/$$(cat .version).tgz
> 	scp /tmp/$$(cat .version).tgz mt.uio.no:/logon/www/delphin/ftp/pet/oe;
> 

==== //pet/oe/cheap/inputtoken.cpp#3 (text) ====

101c101
< input_token::tsdb_derivation(int id, string orth)
---
> input_token::tsdb_derivation(int id, string orth, int start, int end)
106,107c106,107
<         << " " << _p << " " << _start <<  " " << _end
<         << " (\"" << orth << "\" " << _start << " " << _end << "))";
---
>         << " " << _p << " " << start <<  " " << end
>         << " (\"" << orth << "\" " << start << " " << end << "))";

==== //pet/oe/cheap/inputtoken.h#2 (text) ====

81c81
<   string tsdb_derivation(int id, string orth);
---
>   string tsdb_derivation(int id, string orth, int start, int end);

==== //pet/oe/cheap/item.cpp#4 (text) ====

469c469
<     return _dtrs[_keydtr]->tsdb_derivation(_id, orth);
---
>     return _dtrs[_keydtr]->tsdb_derivation(_id, orth, _start, _end);
677a678
> 

==== //pet/oe/cheap/item.h#4 (text) ====


==== //pet/oe/cheap/morph.cpp#3 (text) ====

32,37c32,87
< // return next list (stuff between (balanced) parantheses) in s, starting at
< // start; the position of the closing paren (relative to s) is returned in
< // stop
< string get_next_list(string &s, string::size_type start,
<                      string::size_type &stop)
< {
---
> string morph_unescape_string(const string &s) {
> 
>   string res = "";
> 
>   for(string::size_type i = 0; i < s.length(); i++) {
>     if(s[i] != '\\')
>       //
>       // move magic letter set character from |!| to |\x8|, something that can
>       // otherwise not appear in rule strings.
>       //
>       if(s[i] == '!') res += '\x8';
>       else res += s[i];
>     else {
>       i++;
>       if(i >= s.length())
>         return res;
>       switch(s[i]) {
>       case '!':
>         res += "!";
>         break;
>       case '?':
>         res += "?";
>         break;
>       case '*':
>         res += "*";
>         break;
>       case ')':
>         res += ")";
>         break;
>       case '\\':
>         res += "\\";
>         break;
>       default:
>         res += s[i];
>         break;
>       } // switch
>     } // else
>   } // for
> 
>   return res;
> 
> } // morph_unescape_string()
> 
> string get_next_letter_set(string &s, 
>                            string::size_type start,
>                            string::size_type &stop) {
> 
>   //
>   // _fix_me_
>   // these strings come straight from undumping, i.e. are not converted; hence,
>   // we would have to do UniCode conversion here, prior to parsing the string:
>   // for `unsafe' encodings, part of a double-byte character may appear to be
>   // an opening or closing paren.  while the active multi-byte encodings are 
>   // UTF-8 and EUC-JP, this will not be an issue in practice.   (29-oct-05; oe)
>   //
> 
38a89,96
>   //
>   // first, find opening paren and confirm magic string `letter-set'
>   //
>   string::size_type open = s.find("(", start);
>   if(open == STRING_NPOS || ++open == s.length()) return string();
>   if(s.substr(open, 10) != string("letter-set")) return string();
>   if((open = s.find("(", open)) == STRING_NPOS
>      || ++open == s.length()) return string();
40,42c98,134
<   string::size_type openp = s.find("(", start);
<   if(openp == STRING_NPOS)
<     return string();
---
>   //
>   // now, extract the actual letterset, enclosed in another pair of parens
>   //
>   string::size_type close;
>   bool escapep = false;
>   for(close = open; close < s.length(); ++close) {
>     if(s[close] == '\\') escapep = true;
>     else {
>       if(s[close] == ')' && !escapep) break;
>       escapep = false;
>     } // else
>   } //for
>   
>   if(s[close] != ')') 
>     throw tError("invalid letter set |" + s.substr(open) +"|");
> 
>   stop = close;
>   return s.substr(open, close - open);
> 
> } // get_next_letter_set()
> 
> string get_next_subrule(string &s, 
>                         string::size_type start,
>                         string::size_type &stop) {
> 
>   //
>   // _fix_me_
>   // see UniCode conversion remarks in get_next_letter_set().    (28-oct-05;oe)
>   //
> 
>   stop = start;
>   //
>   // first, find opening paren (the caller has already confirmed and stripped 
>   // the magic strings `prefix' or `suffix')
>   //
>   string::size_type open = s.find("(", start);
>   if(open == STRING_NPOS || ++open == s.length()) return string();
44,51c136,147
<   int plevel = 0;
<   string::size_type closep;
<   for(closep = openp; closep < s.length(); ++closep)
<   {
<     if(s[closep] == '(') plevel++;
<     else if(s[closep] == ')') plevel--;
<     if(plevel == 0) break;
<   }
---
>   //
>   // now, extract the subrule, bounded by a non-escaped closing paren
>   //
>   string::size_type close;
>   bool escapep = false;
>   for(close = open; close < s.length(); ++close) {
>     if(s[close] == '\\') escapep = true;
>     else {
>       if(s[close] == ')' && !escapep) break;
>       escapep = false;
>     } // else
>   } //for
53,56c149,150
<   if(plevel != 0)
<   {
<     throw tError("unbalanced list");
<   }
---
>   if(s[close] != ')') 
>     throw tError("invalid orthographemic subrule |" + s.substr(open) +"|");
58,60c152,153
<   stop = closep;
<   return s.substr(openp+1, closep-openp-1);
< }
---
>   stop = close;
>   return s.substr(open, close - open);
61a155,156
> } // get_next_subrule()
> 
202,203c297,298
<   while(it.hasNext())
<   {
---
>   bool escapep = false;
>   while(it.hasNext()) {
205,206c300,306
<     _elems.insert(c);
<   }
---
>     if(c == '\\' && !escapep) 
>       escapep= true;
>     else {
>       _elems.insert(c);
>       escapep = false;
>     } // else
>   } // while
224a325
> 
304c405
<     if(c1 == '!')
---
>     if(c1 == '\x8')
341c442
<     if(c == '!')
---
>     if(c == '\x8')
398c499
<     if(path.char32At(0) != '!')
---
>     if(path.char32At(0) != '\x8')
467c568
<   while((off = s.indexOf((UChar32) '!', off)) != -1)
---
>   while((off = s.indexOf((UChar32) '\x8', off)) != -1)
670,671c771
< void tMorphAnalyzer::add_global(string rule)
< {
---
> void tMorphAnalyzer::add_global(string rule) {
676,696c776,782
<   
<   start = 0; stop = 1;
<   while(start != stop)
<   {
<     string s = get_next_list(rule, start, stop);
<     if(start != stop)
<     {
<       if(s.substr(0, 10) == string("letter-set"))
<       {
<         string::size_type stop;
<         string ls = get_next_list(s, 0, stop);
<         if(stop != 0)
<         {
<           _lettersets->add(ls);
<         }
<       }
<       else
<       {
<         fprintf(ferr, "ignoring unknown type of inflr <%s>\n",
< 		      s.c_str());
<       }
---
> 
>   start = stop = 0; stop = 1;
>   while(start != stop) {
> 
>     string ls = get_next_letter_set(rule, start, stop);
>     if(start != stop) {
>       _lettersets->add(ls);
698,700c784,798
<     }
<   }
< }
---
>     } // if
>     else 
>       //
>       // ignore remaining trailing parens when looking at the final element
>       //
>       if(rule.substr(start, 2) != "))") {
>         string s = rule.substr(start);
>         fprintf(ferr,
>                 "ignoring remaining letter-set(s) <%s>\n",
>                 s.c_str());
>       } // if
>   } // while
> 
> } // tMorphAnalyzer::add_global()
> 
709c807
<     string subrule = get_next_list(rule, start, stop);
---
>     string subrule = get_next_subrule(rule, start, stop);
711a810
>       subrule = morph_unescape_string(subrule);
786a886,941
> 
>   // handle irregular forms
> 
>   // 1) suppres regular decompositions, if so desired by the grammar
> 
>   if(_irregs_only)
>   {
>     suf.splice(suf.end(), pre);
> 
>     for(list<tMorphAnalysis>::iterator it = suf.begin();
>         it != suf.end(); ++it)
>       if(!matching_irreg_form(*it)) pre.push_back(*it);
>   }
> 
>   // 2) add irregular analyses from table
> 
>   string base = form.base();
>   pair<multimap<string, tMorphAnalysis *>::iterator,
>     multimap<string, tMorphAnalysis *>::iterator> eq =
>     _irregs_by_form.equal_range(base);
> 
>   for(multimap<string, tMorphAnalysis *>::iterator it = eq.first;
>       it != eq.second; ++it) {
> 
>     //
>     // _fix_me_
>     // the following largely duplicates code from morph_trie::analyze(): here,
>     // we need to check the orthographemic chain so far, so as to avoid adding
>     // a cycle (as triggered, for example, by rules like `bet past_verb bet).
>     // 
>     //
>     type_t candidate = it->second->rules().front();
>     list<type_t> rules = form.rules();
>     list<string> forms = form.forms();
>     list<type_t>::iterator rule;
>     list<string>::iterator form;
>     bool cyclep = false;
>     for(rule = rules.begin(), form = forms.begin();
>         rule != rules.end() && form != forms.end();
>         ++rule, ++form) {
> 
>       if(*rule == candidate && (_duplicate_filter_p || *form == base)) {
>         cyclep = true;
>         break;
>       } // if
> 
>     } // for
> 
>     if(cyclep) continue;
> 
>     rules.push_front(candidate);
>     forms.push_front(it->second->base());
>     pre.push_back(tMorphAnalysis(forms, rules));
> 
>   } // for
> 
865,889d1019
<   // handle irregular forms
< 
<   // 1) filter regular results if desired
< 
<   if(_irregs_only)
<   {
<     prev_results.clear();
<     prev_results.splice(prev_results.end(), final_results);
< 
<     for(list<tMorphAnalysis>::iterator it = prev_results.begin();
<         it != prev_results.end(); ++it)
<       if(!matching_irreg_form(*it))
<         final_results.push_back(*it);
<   }
< 
<   // 2) add irregular analyses from table
< 
<   pair<multimap<string, tMorphAnalysis *>::iterator,
<     multimap<string, tMorphAnalysis *>::iterator> eq =
<     _irregs_by_form.equal_range(form);
< 
<   for(multimap<string, tMorphAnalysis *>::iterator it = eq.first;
<       it != eq.second; ++it)
<     final_results.push_back(*it->second);
< 

==== //pet/oe/cheap/morph.h#3 (text) ====


==== //pet/oe/cheap/parse.cpp#4 (text) ====

522d521
<