// libmorph_check: a thin stdin->stdout bridge to the libmorph Russian morphological // analyser, for use by the Stage-2 classifier (scripts/ru_stage2.py). // // Reads one word per line (bytes are passed through verbatim — the caller encodes to // the code page the libmorph char interface expects, CP1251). For each word it writes // a line: // // \t:\t:... // // where is CheckWord's result (1 = in the dictionary, 0 = not), and each // following field is one lexeme: its part of speech (wdInfo & 0x3f) and lemma. // // Build: g++ -std=c++17 -O2 scripts/libmorph_check.cpp -lmorphrus -lmoonycode -o libmorph_check #include #include #include #include #include int main(int argc, char** argv) { // The factory key selects the code page: "libmorph.api.v4:". Use the // UTF-8 instance so words pass through verbatim. IMlmaMbXX only adds non-virtual // convenience wrappers over IMlmaMb, so the filled pointer can be used as such. const char* key = argc > 1 ? argv[1] : "libmorph.api.v4:utf-8"; IMlmaMbXX* mlma = nullptr; int rc = mlmaruGetAPI(key, (void**)&mlma); if (mlma == nullptr) { std::fprintf(stderr, "libmorph_check: GetAPI('%s') failed, rc=%d\n", key, rc); return 1; } std::string line; while (std::getline(std::cin, line)) { if (!line.empty() && line.back() == '\r') line.pop_back(); IMlmaMbXX::inword w(line.c_str(), line.size()); int known = mlma->CheckWord(w, sfIgnoreCapitals); std::cout << known; try { for (auto& lx : mlma->Lemmatize(w, sfIgnoreCapitals)) { unsigned pos = lx.ngrams > 0 ? (lx.pgrams[0].wdInfo & 0x3f) : 0xffu; std::cout << '\t' << pos << ':' << (lx.plemma ? lx.plemma : ""); } } catch (...) { } std::cout << '\n'; } return 0; }