Change 877 by oe@mv on 2005/11/03 15:05:51 fix two problems in orthographemic processing, (a) handling of `funny' characters (#\!, #\?, #\*, and #\) as of late) and escape conventions in %letter-set, %prefix, and %suffix annotations; and (b) recursive invocation of the segmentation machinery on irregularly-derived forms, i.e. treating the irregular entries much like string-level (rather than stem-anchored) variations. also, add a `build' target to the Makefile and fix a minor bug with printing multi-word stems in derivations. Affected files ... ... //pet/oe/cheap/Makefile#4 edit ... //pet/oe/cheap/inputtoken.cpp#3 edit ... //pet/oe/cheap/inputtoken.h#2 edit ... //pet/oe/cheap/item.cpp#4 edit ... //pet/oe/cheap/item.h#4 edit ... //pet/oe/cheap/morph.cpp#3 edit ... //pet/oe/cheap/morph.h#3 edit ... //pet/oe/cheap/parse.cpp#4 edit Differences ... ==== //pet/oe/cheap/Makefile#4 (text) ==== 19,21c19,21 < ICUROOT = /lingo/local < ECLROOT = /lingo/local < LKBROOT = /user/oe/src/delphin/lkb --- > ICUROOT = /usr/local > ECLROOT = /usr/local > LKBROOT = $$DELPHINHOME/lkb 27c27 < CPPFLAGS1 = -Wall -g -DHASH_MAP_AVAIL -DUSEMMAP -DYY $(INCLUDE) \ --- > CPPFLAGS1 = -Wall -O6 -DHASH_MAP_AVAIL -DUSEMMAP -DYY $(INCLUDE) \ 30c30 < # CPPFLAGS1 = -Wall -g -O3 -DYY $(INCLUDE) --- > # CPPFLAGS1 = -Wall -g -DYY $(INCLUDE) 129c129 < # rebuild MRS library, using an installed ECL binary (in the PATH) --- > # rebuild MRS and FSPP libraries, using an installed ECL binary 138a139,161 > fspp: > ( \ > echo "(load \"$(LKBROOT)/src/general/loadup.lisp\")"; \ > echo "(compile-system \"fspp\" :force t)"; \ > ) | $(ECLROOT)/bin/ecl > ${RM} -f fspp.h > ${LN} -s $(LKBROOT)/include/fspp.h fspp.h > > > # > # > # > build: > p4 changes ../... | head -1 | awk '{print $$2}' > .version > ( cd ../../..; \ > tar zpScvf /tmp/pet.tgz \ > --exclude "*.o" --exclude "*~" \ > --exclude "*/flop-" --exclude "*/flop+" \ > pet/oe; \ > ) > mv /tmp/pet.tgz /tmp/$$(cat .version).tgz > scp /tmp/$$(cat .version).tgz mt.uio.no:/logon/www/delphin/ftp/pet/oe; > ==== //pet/oe/cheap/inputtoken.cpp#3 (text) ==== 101c101 < input_token::tsdb_derivation(int id, string orth) --- > input_token::tsdb_derivation(int id, string orth, int start, int end) 106,107c106,107 < << " " << _p << " " << _start << " " << _end < << " (\"" << orth << "\" " << _start << " " << _end << "))"; --- > << " " << _p << " " << start << " " << end > << " (\"" << orth << "\" " << start << " " << end << "))"; ==== //pet/oe/cheap/inputtoken.h#2 (text) ==== 81c81 < string tsdb_derivation(int id, string orth); --- > string tsdb_derivation(int id, string orth, int start, int end); ==== //pet/oe/cheap/item.cpp#4 (text) ==== 469c469 < return _dtrs[_keydtr]->tsdb_derivation(_id, orth); --- > return _dtrs[_keydtr]->tsdb_derivation(_id, orth, _start, _end); 677a678 > ==== //pet/oe/cheap/item.h#4 (text) ==== ==== //pet/oe/cheap/morph.cpp#3 (text) ==== 32,37c32,87 < // return next list (stuff between (balanced) parantheses) in s, starting at < // start; the position of the closing paren (relative to s) is returned in < // stop < string get_next_list(string &s, string::size_type start, < string::size_type &stop) < { --- > string morph_unescape_string(const string &s) { > > string res = ""; > > for(string::size_type i = 0; i < s.length(); i++) { > if(s[i] != '\\') > // > // move magic letter set character from |!| to |\x8|, something that can > // otherwise not appear in rule strings. > // > if(s[i] == '!') res += '\x8'; > else res += s[i]; > else { > i++; > if(i >= s.length()) > return res; > switch(s[i]) { > case '!': > res += "!"; > break; > case '?': > res += "?"; > break; > case '*': > res += "*"; > break; > case ')': > res += ")"; > break; > case '\\': > res += "\\"; > break; > default: > res += s[i]; > break; > } // switch > } // else > } // for > > return res; > > } // morph_unescape_string() > > string get_next_letter_set(string &s, > string::size_type start, > string::size_type &stop) { > > // > // _fix_me_ > // these strings come straight from undumping, i.e. are not converted; hence, > // we would have to do UniCode conversion here, prior to parsing the string: > // for `unsafe' encodings, part of a double-byte character may appear to be > // an opening or closing paren. while the active multi-byte encodings are > // UTF-8 and EUC-JP, this will not be an issue in practice. (29-oct-05; oe) > // > 38a89,96 > // > // first, find opening paren and confirm magic string `letter-set' > // > string::size_type open = s.find("(", start); > if(open == STRING_NPOS || ++open == s.length()) return string(); > if(s.substr(open, 10) != string("letter-set")) return string(); > if((open = s.find("(", open)) == STRING_NPOS > || ++open == s.length()) return string(); 40,42c98,134 < string::size_type openp = s.find("(", start); < if(openp == STRING_NPOS) < return string(); --- > // > // now, extract the actual letterset, enclosed in another pair of parens > // > string::size_type close; > bool escapep = false; > for(close = open; close < s.length(); ++close) { > if(s[close] == '\\') escapep = true; > else { > if(s[close] == ')' && !escapep) break; > escapep = false; > } // else > } //for > > if(s[close] != ')') > throw tError("invalid letter set |" + s.substr(open) +"|"); > > stop = close; > return s.substr(open, close - open); > > } // get_next_letter_set() > > string get_next_subrule(string &s, > string::size_type start, > string::size_type &stop) { > > // > // _fix_me_ > // see UniCode conversion remarks in get_next_letter_set(). (28-oct-05;oe) > // > > stop = start; > // > // first, find opening paren (the caller has already confirmed and stripped > // the magic strings `prefix' or `suffix') > // > string::size_type open = s.find("(", start); > if(open == STRING_NPOS || ++open == s.length()) return string(); 44,51c136,147 < int plevel = 0; < string::size_type closep; < for(closep = openp; closep < s.length(); ++closep) < { < if(s[closep] == '(') plevel++; < else if(s[closep] == ')') plevel--; < if(plevel == 0) break; < } --- > // > // now, extract the subrule, bounded by a non-escaped closing paren > // > string::size_type close; > bool escapep = false; > for(close = open; close < s.length(); ++close) { > if(s[close] == '\\') escapep = true; > else { > if(s[close] == ')' && !escapep) break; > escapep = false; > } // else > } //for 53,56c149,150 < if(plevel != 0) < { < throw tError("unbalanced list"); < } --- > if(s[close] != ')') > throw tError("invalid orthographemic subrule |" + s.substr(open) +"|"); 58,60c152,153 < stop = closep; < return s.substr(openp+1, closep-openp-1); < } --- > stop = close; > return s.substr(open, close - open); 61a155,156 > } // get_next_subrule() > 202,203c297,298 < while(it.hasNext()) < { --- > bool escapep = false; > while(it.hasNext()) { 205,206c300,306 < _elems.insert(c); < } --- > if(c == '\\' && !escapep) > escapep= true; > else { > _elems.insert(c); > escapep = false; > } // else > } // while 224a325 > 304c405 < if(c1 == '!') --- > if(c1 == '\x8') 341c442 < if(c == '!') --- > if(c == '\x8') 398c499 < if(path.char32At(0) != '!') --- > if(path.char32At(0) != '\x8') 467c568 < while((off = s.indexOf((UChar32) '!', off)) != -1) --- > while((off = s.indexOf((UChar32) '\x8', off)) != -1) 670,671c771 < void tMorphAnalyzer::add_global(string rule) < { --- > void tMorphAnalyzer::add_global(string rule) { 676,696c776,782 < < start = 0; stop = 1; < while(start != stop) < { < string s = get_next_list(rule, start, stop); < if(start != stop) < { < if(s.substr(0, 10) == string("letter-set")) < { < string::size_type stop; < string ls = get_next_list(s, 0, stop); < if(stop != 0) < { < _lettersets->add(ls); < } < } < else < { < fprintf(ferr, "ignoring unknown type of inflr <%s>\n", < s.c_str()); < } --- > > start = stop = 0; stop = 1; > while(start != stop) { > > string ls = get_next_letter_set(rule, start, stop); > if(start != stop) { > _lettersets->add(ls); 698,700c784,798 < } < } < } --- > } // if > else > // > // ignore remaining trailing parens when looking at the final element > // > if(rule.substr(start, 2) != "))") { > string s = rule.substr(start); > fprintf(ferr, > "ignoring remaining letter-set(s) <%s>\n", > s.c_str()); > } // if > } // while > > } // tMorphAnalyzer::add_global() > 709c807 < string subrule = get_next_list(rule, start, stop); --- > string subrule = get_next_subrule(rule, start, stop); 711a810 > subrule = morph_unescape_string(subrule); 786a886,941 > > // handle irregular forms > > // 1) suppres regular decompositions, if so desired by the grammar > > if(_irregs_only) > { > suf.splice(suf.end(), pre); > > for(list::iterator it = suf.begin(); > it != suf.end(); ++it) > if(!matching_irreg_form(*it)) pre.push_back(*it); > } > > // 2) add irregular analyses from table > > string base = form.base(); > pair::iterator, > multimap::iterator> eq = > _irregs_by_form.equal_range(base); > > for(multimap::iterator it = eq.first; > it != eq.second; ++it) { > > // > // _fix_me_ > // the following largely duplicates code from morph_trie::analyze(): here, > // we need to check the orthographemic chain so far, so as to avoid adding > // a cycle (as triggered, for example, by rules like `bet past_verb bet). > // > // > type_t candidate = it->second->rules().front(); > list rules = form.rules(); > list forms = form.forms(); > list::iterator rule; > list::iterator form; > bool cyclep = false; > for(rule = rules.begin(), form = forms.begin(); > rule != rules.end() && form != forms.end(); > ++rule, ++form) { > > if(*rule == candidate && (_duplicate_filter_p || *form == base)) { > cyclep = true; > break; > } // if > > } // for > > if(cyclep) continue; > > rules.push_front(candidate); > forms.push_front(it->second->base()); > pre.push_back(tMorphAnalysis(forms, rules)); > > } // for > 865,889d1019 < // handle irregular forms < < // 1) filter regular results if desired < < if(_irregs_only) < { < prev_results.clear(); < prev_results.splice(prev_results.end(), final_results); < < for(list::iterator it = prev_results.begin(); < it != prev_results.end(); ++it) < if(!matching_irreg_form(*it)) < final_results.push_back(*it); < } < < // 2) add irregular analyses from table < < pair::iterator, < multimap::iterator> eq = < _irregs_by_form.equal_range(form); < < for(multimap::iterator it = eq.first; < it != eq.second; ++it) < final_results.push_back(*it->second); < ==== //pet/oe/cheap/morph.h#3 (text) ==== ==== //pet/oe/cheap/parse.cpp#4 (text) ==== 522d521 <