diff --git a/lib/utils/meson.build b/lib/utils/meson.build index efb9ffc2..ddf526dc 100644 --- a/lib/utils/meson.build +++ b/lib/utils/meson.build @@ -17,6 +17,7 @@ lib_mu_utils=static_library('mu-utils', [ 'mu-command-handler.cc', + 'mu-lang-detector.cc', 'mu-logger.cc', 'mu-option.cc', 'mu-readline.cc', @@ -29,7 +30,8 @@ lib_mu_utils=static_library('mu-utils', [ gio_dep, gio_unix_dep, config_h_dep, - readline_dep + readline_dep, + cld2_dep ], include_directories: include_directories(['.','..']), install: false) @@ -72,4 +74,11 @@ test('test-logger', cpp_args: ['-DBUILD_TESTS'], dependencies: [glib_dep, lib_mu_utils_dep, thread_dep ])) +test('test-lang-detector', + executable('test-lang-detector', 'mu-lang-detector.cc', + install: false, + cpp_args: ['-DBUILD_TESTS'], + dependencies: [glib_dep, lib_mu_utils_dep, cld2_dep, config_h_dep])) + + subdir('tests') diff --git a/lib/utils/mu-lang-detector.cc b/lib/utils/mu-lang-detector.cc new file mode 100644 index 00000000..deadf2f3 --- /dev/null +++ b/lib/utils/mu-lang-detector.cc @@ -0,0 +1,100 @@ +/* +** Copyright (C) 2023 Dirk-Jan C. Binnema +** +** This program is free software; you can redistribute it and/or modify it +** under the terms of the GNU General Public License as published by the +** Free Software Foundation; either version 3, or (at your option) any +** later version. +** +** This program is distributed in the hope that it will be useful, +** but WITHOUT ANY WARRANTY; without even the implied warranty of +** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +** GNU General Public License for more details. +** +** You should have received a copy of the GNU General Public License +** along with this program; if not, write to the Free Software Foundation, +** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+**
+*/
+#include "config.h"
+#include "mu-lang-detector.hh"
+
+using namespace Mu;
+
+#ifndef HAVE_CLD2
+// Built without CLD2: no detection is possible, so always yield Nothing.
+Option<Language> Mu::detect_language(const std::string&) { return Nothing; }
+#else
+#include <cld2/public/compact_lang_det.h>
+#include <cld2/public/encodings.h>
+
+// Ask CLD2 for the language of txt; only results CLD2 flags as reliable are used.
+Option<Language>
+Mu::detect_language(const std::string& txt)
+{
+	bool is_reliable{};
+	const auto lang = CLD2::DetectLanguage(
+		txt.c_str(), static_cast<int>(txt.length()),
+		true/*plain-text*/, &is_reliable);
+	if (lang == CLD2::UNKNOWN_LANGUAGE || !is_reliable)
+		return Nothing;
+
+	// name and code are C strings handed out by CLD2; refuse incomplete results.
+	Language res = { CLD2::LanguageName(lang), CLD2::LanguageCode(lang) };
+	if (!res.name || !res.code)
+		return Nothing;
+	return Some(std::move(res));
+}
+#endif /*HAVE_CLD2*/
+
+#ifdef BUILD_TESTS
+#include <string>
+#include <tuple>
+#include <vector>
+#include "mu-test-utils.hh"
+
+// Check detection on a few single-language samples; without CLD2, verify
+// instead that the fallback reports nothing.
+static void
+test_lang_detector()
+{
+	// sample text, expected language name, expected language code
+	using Case = std::tuple<std::string, std::string, std::string>;
+	using Cases = std::vector<Case>;
+
+	const Cases tests = {{
+		{ "hello world, this is a bit of English",
+		  "ENGLISH", "en" },
+		{ "En nu een paar Nederlandse woorden",
+		  "DUTCH", "nl" },
+		{ "Hyvää huomenta! Puhun vähän suomea",
+		  "FINNISH", "fi" },
+		{ "So eine Arbeit wird eigentlich nie fertig, man muß sie für "
+		  "fertig erklären, wenn man nach Zeit und Umständen das "
+		  "möglichste getan hat.",
+		  "GERMAN", "de"}
+	}};
+
+	for (auto&& test: tests) {
+		const auto res = detect_language(std::get<0>(test));
+#ifndef HAVE_CLD2
+		g_assert_false(!!res);
+#else
+		g_assert_true(!!res);
+		assert_equal(std::get<1>(test), res->name);
+		assert_equal(std::get<2>(test), res->code);
+#endif
+	}
+}
+
+int
+main(int argc, char* argv[])
+{
+	mu_test_init(&argc, &argv);
+
+	g_test_add_func("/utils/lang-detector", test_lang_detector);
+
+	return g_test_run();
+}
+
+#endif /*BUILD_TESTS*/
diff --git a/lib/utils/mu-lang-detector.hh b/lib/utils/mu-lang-detector.hh
new file mode 100644
index 00000000..0b692bc1
--- /dev/null
+++ b/lib/utils/mu-lang-detector.hh
@@ -0,0 +1,46 @@
+/*
+** Copyright (C) 2023 Dirk-Jan C.
Binnema
+**
+** This program is free software; you can redistribute it and/or modify it
+** under the terms of the GNU General Public License as published by the
+** Free Software Foundation; either version 3, or (at your option) any
+** later version.
+**
+** This program is distributed in the hope that it will be useful,
+** but WITHOUT ANY WARRANTY; without even the implied warranty of
+** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+** GNU General Public License for more details.
+**
+** You should have received a copy of the GNU General Public License
+** along with this program; if not, write to the Free Software Foundation,
+** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+**
+*/
+
+#ifndef MU_LANG_DETECTOR_HH__
+#define MU_LANG_DETECTOR_HH__
+
+#include <string>
+#include "mu-option.hh"
+
+namespace Mu {
+
+/** A detected language: its name and its short code. */
+struct Language {
+	const char *name; /**< Language name, e.g. "DUTCH" */
+	const char *code; /**< Language code, e.g. "nl" */
+};
+
+/**
+ * Detect the language of text
+ *
+ * @param txt some text (UTF-8)
+ *
+ * @return either a Language or nothing; the latter
+ * also if we cannot reliably determine a single language
+ */
+Option<Language> detect_language(const std::string& txt);
+
+} // namespace Mu
+
+#endif /* MU_LANG_DETECTOR_HH__ */
diff --git a/meson.build b/meson.build
index fe799494..e28df873 100644
--- a/meson.build
+++ b/meson.build
@@ -86,7 +86,7 @@ cxx.check_header('charconv', required:true)
 # config.h setup
 #
 config_h_data=configuration_data()
-config_h_data.set_quoted('MU_STORE_SCHEMA_VERSION', '466')
+config_h_data.set_quoted('MU_STORE_SCHEMA_VERSION', '467')
 config_h_data.set_quoted('PACKAGE_VERSION', meson.project_version())
 config_h_data.set_quoted('PACKAGE_STRING', meson.project_name() + ' ' +
                          meson.project_version())
@@ -129,6 +129,14 @@ gmime_dep = dependency('gmime-3.0', version: '>= 3.2')
 xapian_dep = dependency('xapian-core', version:'>= 1.4')
 thread_dep = dependency('threads')
 
+# optionally, use Compact Language Detector2 if we can find it; when found,
+# HAVE_CLD2 is set in config.h, which selects the CLD2-backed implementation.
+cld2_dep = meson.get_compiler('cpp').find_library('cld2', required: false)
+if cld2_dep.found()
+  config_h_data.set('HAVE_CLD2', 1)
+else
+  message('CLD2 not found; no support for language detection')
+endif
 awk=find_program(['gawk', 'awk'])
 gzip=find_program('gzip')
 