540ee32178
Build a committed Russian common-noun word list (dictprep/russian/scrabble.txt) from the RAN orthographic dictionary, for the Эрудит ruleset. - Stage 1 (Go, dictprep/ruwords): orfo_dict_2025.txt -> all.txt; extracts headwords, reconstructs "ед." singulars (suppressing plurals), pairs "и" variants. - Stage 2 (Python brain, dictprep/ru_stage2.py): OpenCorpora (mawo-pymorphy3) + libmorph + orthographic notes select common nouns (nom. sing.); --trace explains a word's fate, --dump writes the in-memory buckets. - libmorph C++ bridge (libmorph_check.cpp); manual_confirm.txt is merged in. - orfo_dict_2025.txt is the committed pdftotext source of truth. - See dictprep/README.md for methodology and reproducibility.
48 lines
1.8 KiB
C++
48 lines
1.8 KiB
C++
// libmorph_check: a thin stdin->stdout bridge to the libmorph Russian morphological
|
|
// analyser, for use by the Stage-2 classifier (scripts/ru_stage2.py).
|
|
//
|
|
// Reads one word per line (bytes are passed through verbatim — the caller encodes to
|
|
// the code page selected by the factory key — UTF-8 by default; argv[1] overrides it). For each word it writes
|
|
// a line:
|
|
//
|
|
// <known>\t<pos>:<lemma>\t<pos>:<lemma>...
|
|
//
|
|
// where <known> is CheckWord's result (1 = in the dictionary, 0 = not), and each
|
|
// following field is one lexeme: its part of speech (wdInfo & 0x3f) and lemma.
|
|
//
|
|
// Build: g++ -std=c++17 -O2 scripts/libmorph_check.cpp -lmorphrus -lmoonycode -o libmorph_check
|
|
#include <libmorph/rus.h>
|
|
#include <libmorph/api.hpp>
|
|
#include <cstdio>
|
|
#include <iostream>
|
|
#include <string>
|
|
|
|
int main(int argc, char** argv) {
|
|
// The factory key selects the code page: "libmorph.api.v4:<charset>". Use the
|
|
// UTF-8 instance so words pass through verbatim. IMlmaMbXX only adds non-virtual
|
|
// convenience wrappers over IMlmaMb, so the filled pointer can be used as such.
|
|
const char* key = argc > 1 ? argv[1] : "libmorph.api.v4:utf-8";
|
|
IMlmaMbXX* mlma = nullptr;
|
|
int rc = mlmaruGetAPI(key, (void**)&mlma);
|
|
if (mlma == nullptr) {
|
|
std::fprintf(stderr, "libmorph_check: GetAPI('%s') failed, rc=%d\n", key, rc);
|
|
return 1;
|
|
}
|
|
std::string line;
|
|
while (std::getline(std::cin, line)) {
|
|
if (!line.empty() && line.back() == '\r') line.pop_back();
|
|
IMlmaMbXX::inword w(line.c_str(), line.size());
|
|
int known = mlma->CheckWord(w, sfIgnoreCapitals);
|
|
std::cout << known;
|
|
try {
|
|
for (auto& lx : mlma->Lemmatize(w, sfIgnoreCapitals)) {
|
|
unsigned pos = lx.ngrams > 0 ? (lx.pgrams[0].wdInfo & 0x3f) : 0xffu;
|
|
std::cout << '\t' << pos << ':' << (lx.plemma ? lx.plemma : "");
|
|
}
|
|
} catch (...) {
|
|
}
|
|
std::cout << '\n';
|
|
}
|
|
return 0;
|
|
}
|