Files
scrabble-dictionary/tools/libmorph_check.cpp
T
Ilia Denisov dd61ff1d51
build / dawg (pull_request) Successful in 4m22s
Tidy sources into sources/<variant>/ + tools/
Consolidate the scattered build inputs (dictionaries/english/, dictprep/russian/)
into one sources/ tree keyed by the variant labels (scrabble_en/scrabble_ru/
erudit_ru), and move the Russian prep pipeline to tools/. The dawg outputs and
their filenames are unchanged — rebuilt byte-identical (en_sowpods/ru_scrabble/
ru_erudit) — so the release artifact and the backend are unaffected.

ru_stage2.py OUT_DIR and the ruwords flag defaults are repointed to
sources/scrabble_ru/; Makefile / CI / cmd/builddict default / README updated;
pipeline intermediates git-ignored. Verified: make dawg byte-identical to the
committed baseline, py_compile + go vet of the moved tools. The full Russian
regeneration pipeline (pymorphy3/libmorph/orfo PDF) was not run here.
2026-06-09 12:25:33 +02:00

48 lines
1.8 KiB
C++

// libmorph_check: a thin stdin->stdout bridge to the libmorph Russian morphological
// analyser, for use by the Stage-2 classifier (tools/ru_stage2.py).
//
// Reads one word per line (bytes are passed through verbatim — the caller must encode
// them in the code page selected by the factory key: UTF-8 by default, or the charset
// named by the key passed as argv[1]). For each word it writes a line:
//
// <known>\t<pos>:<lemma>\t<pos>:<lemma>...
//
// where <known> is CheckWord's result (1 = in the dictionary, 0 = not), and each
// following field is one lexeme: its part of speech (wdInfo & 0x3f of the lexeme's
// first grammatical description, or 255 if it has none) and its lemma.
//
// Build: g++ -std=c++17 -O2 tools/libmorph_check.cpp -lmorphrus -lmoonycode -o libmorph_check
#include <libmorph/rus.h>
#include <libmorph/api.hpp>
#include <cstdio>
#include <exception>
#include <iostream>
#include <string>
// Entry point. Obtains a libmorph API instance, then for every line read from stdin
// writes exactly one line to stdout: CheckWord's result followed by one
// "\t<pos>:<lemma>" field per lexeme returned by Lemmatize.
//
// argv[1] (optional) overrides the factory key passed to mlmaruGetAPI.
//
// Exit status: 0 on success; 1 if the API instance could not be obtained or if
// writing to stdout failed. A lemmatization failure for an individual word is not
// fatal: that word's line carries only the CheckWord result, and the failure is
// reported on stderr.
int main(int argc, char** argv) {
    // The factory key selects the code page: "libmorph.api.v4:<charset>". Use the
    // UTF-8 instance by default so words pass through verbatim. IMlmaMbXX only adds
    // non-virtual convenience wrappers over IMlmaMb, so the filled pointer can be
    // used as such.
    const char* const key = argc > 1 ? argv[1] : "libmorph.api.v4:utf-8";
    IMlmaMbXX* mlma = nullptr;
    const int rc = mlmaruGetAPI(key, reinterpret_cast<void**>(&mlma));
    if (mlma == nullptr) {
        std::fprintf(stderr, "libmorph_check: GetAPI('%s') failed, rc=%d\n", key, rc);
        return 1;
    }

    // std::cin is deliberately left tied to std::cout: the tie flushes pending
    // answers before a read, which a caller driving this process line by line over
    // pipes would need. NOTE(review): confirm how the Stage-2 script drives it
    // before untying for speed.
    unsigned long failures = 0;
    std::string line;
    while (std::getline(std::cin, line)) {
        // Tolerate CRLF input: getline strips '\n' only.
        if (!line.empty() && line.back() == '\r') line.pop_back();

        IMlmaMbXX::inword w(line.c_str(), line.size());
        const int known = mlma->CheckWord(w, sfIgnoreCapitals);
        std::cout << known;
        try {
            for (auto& lx : mlma->Lemmatize(w, sfIgnoreCapitals)) {
                // 0xff marks a lexeme that carries no grammatical description.
                const unsigned pos = lx.ngrams > 0 ? (lx.pgrams[0].wdInfo & 0x3f) : 0xffu;
                std::cout << '\t' << pos << ':' << (lx.plemma ? lx.plemma : "");
            }
        } catch (const std::exception& e) {
            // Best effort: keep the output line (CheckWord result only) so output
            // stays aligned with input, but do not hide the failure. Only the first
            // one is reported in detail to keep stderr bounded.
            if (failures++ == 0) {
                std::fprintf(stderr, "libmorph_check: Lemmatize('%s') failed: %s\n",
                             line.c_str(), e.what());
            }
        } catch (...) {
            if (failures++ == 0) {
                std::fprintf(stderr, "libmorph_check: Lemmatize('%s') failed: unknown exception\n",
                             line.c_str());
            }
        }
        std::cout << '\n';
    }

    if (failures != 0) {
        std::fprintf(stderr, "libmorph_check: lemmatization failed for %lu word(s)\n", failures);
    }

    // A truncated result file would silently misalign words and answers downstream,
    // so surface write errors through the exit status.
    std::cout.flush();
    if (!std::cout) {
        std::fprintf(stderr, "libmorph_check: error writing to stdout\n");
        return 1;
    }
    return 0;
}