mu: index html text as if it were plain text

This is a bit of a hack to include HTML text in search results.

Of course, HTML text is not really plain text, so this is only a
stopgap until we introduce a proper HTML parsing step.
This commit is contained in:
Dirk-Jan C. Binnema
2023-01-31 23:12:05 +02:00
parent ea08378ce6
commit abfa6f277c
3 changed files with 83 additions and 15 deletions

View File

@ -83,6 +83,16 @@ add_search_term(Xapian::Document& doc, const Field& field, const std::string& va
throw std::logic_error("not a search term");
}
/*
 * Stopgap: hand the HTML body to the term generator as though it were
 * plain text.
 *
 * The value is passed through utf8_flatten() and indexed with the term
 * prefix of the BodyText field; the @field argument itself is accepted but
 * not consulted here.
 */
static void
add_body_html(Xapian::Document& doc, const Field& field, const std::string& val)
{
	static Field text_field = field_from_id(Field::Id::BodyText);

	const auto flattened = utf8_flatten(val);
	const auto prefix    = text_field.xapian_term();

	Xapian::TermGenerator indexer;
	indexer.set_document(doc);
	indexer.index_text(flattened, 1, prefix);
}
void
Document::add(Field::Id id, const std::string& val)
@ -94,7 +104,8 @@ Document::add(Field::Id id, const std::string& val)
if (field.is_searchable())
add_search_term(xdoc_, field, val);
else if (id == Field::Id::XBodyHtml)
add_body_html(xdoc_, field, val);
if (field.include_in_sexp()) {
put_prop(field, val);
}

View File

@ -718,9 +718,60 @@ Boo!
assert_valid_result(qr);
g_assert_cmpuint(qr->size(), ==, 3);
}
}
static void
test_html()
{
	// A single multipart/alternative message with a text/plain part whose
	// body is the word "text" and a text/html part whose body is the word
	// "html". Both words must be found by a "body:" query.
	//
	// The layout of the fixture matters: an empty line terminates each
	// header section (the message's own and that of each MIME part), and the
	// folded "boundary=" continuation line must start with whitespace.
	// Without those, the part bodies would be read as header lines.
	const auto test_msg = R"(From: Test <test@example.com>
To: abc@example.com
Date: Mon, 23 May 2011 10:53:45 +0200
Subject: vla
MIME-Version: 1.0
Content-Type: multipart/alternative;
	boundary="_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d"
Message-ID: <10374608.109906.11909.20115aabbccdd.MSGID@mailinglijst.nl>

--_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d
Content-Type: text/plain; charset="iso-8859-15"
Content-Transfer-Encoding: quoted-printable

text
--_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d
Content-Type: text/html; charset="iso-8859-15"
Content-Transfer-Encoding: quoted-printable

html
--_=aspNetEmail=_5ed4592191214c7a99bd7f6a3a0f077d--
)";
	const TestMap test_msgs = {{"inbox/cur/msg1", test_msg}};

	TempDir tdir;
	auto store{make_test_store(tdir.path(), test_msgs, {})};
	g_assert_cmpuint(store.size(), ==, 1);

	{
		// the word from the text/plain alternative
		auto qr = store.run_query("body:text", Field::Id::Date,
					  QueryFlags::None);
		assert_valid_result(qr);
		g_assert_cmpuint(qr->size(), ==, 1);
	}

	{
		// the word from the text/html alternative
		auto qr = store.run_query("body:html", Field::Id::Date,
					  QueryFlags::None);
		assert_valid_result(qr);
		g_assert_cmpuint(qr->size(), ==, 1);
	}
}
int
main(int argc, char* argv[])
{
@ -745,6 +796,7 @@ main(int argc, char* argv[])
test_term_split);
g_test_add_func("/store/query/related-dup-threaded",
test_related_dup_threaded);
g_test_add_func("/store/query/html", test_html);
return g_test_run();
}