lib: implement new query parser
mu's query parser is the piece of software that turns your queries
into something the Xapian database can understand. So, if you query
"maildir:/inbox and subject:bla" this must be translated into a
Xapian::Query object which will retrieve the sought after messages.
Since mu's beginning, almost a decade ago, this parser was based on
Xapian's default Xapian::QueryParser. It works okay, but wasn't really
designed for the mu use-case, and had a bit of trouble with anything
that's not A..Z (think: spaces, special characters, unicode etc.).
Over the years, mu added quite a bit of pre-processing trickery to
deal with that. Still, there were corner cases and bugs that were
practically unfixable.
The solution to all of this is to have a custom query processor that
replaces Xapian's, and write it from the ground up to deal with the
special characters etc. I wrote one, as part of my "future, post-1.0
mu" research project, and I have now backported it to mu 0.9.19.
From a technical perspective, this is a major cleanup, and allows us
to get rid of much of the fragile preprocessing both for indexing and
querying. From an end-user perspective this (hopefully) means that
many of the little parsing issues are gone, and it opens the way for
some new features.
From an end-user perspective:
- better support for special characters.
- regexp search! yes, you can now search for regular expressions, e.g.
subject:/h.ll?o/
will find subjects with hallo, hello, halo, philosophy, ...
As you can imagine, this can be a _heavy_ operation on the database,
and might take quite a bit longer than a normal query; but it can be
quite useful.
This commit is contained in:
349
lib/parser/utils.cc
Normal file
349
lib/parser/utils.cc
Normal file
@ -0,0 +1,349 @@
|
||||
/*
|
||||
** Copyright (C) 2017 Dirk-Jan C. Binnema <djcb@djcbsoftware.nl>
|
||||
**
|
||||
** This library is free software; you can redistribute it and/or
|
||||
** modify it under the terms of the GNU Lesser General Public License
|
||||
** as published by the Free Software Foundation; either version 2.1
|
||||
** of the License, or (at your option) any later version.
|
||||
**
|
||||
** This library is distributed in the hope that it will be useful,
|
||||
** but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
** Lesser General Public License for more details.
|
||||
**
|
||||
** You should have received a copy of the GNU Lesser General Public
|
||||
** License along with this library; if not, write to the Free
|
||||
** Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA
|
||||
** 02110-1301, USA.
|
||||
*/
|
||||
|
||||
#define GNU_SOURCE
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include "utils.hh"
|
||||
|
||||
#include <string.h>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
#include <glib.h>
|
||||
|
||||
using namespace Mux;
|
||||
|
||||
namespace {
|
||||
|
||||
/**
 * unichar_tolower:
 * @uc: a unicode character
 *
 * Downcase a single character. Non-alphabetic characters are returned
 * unchanged, and characters outside the Latin script are simply passed to
 * g_unichar_tolower(). A few Latin letters are folded to a plain ASCII
 * letter instead; both the upper- and the lowercase form of each such
 * letter map to the same result, so that flattening does not depend on
 * the case of the input.
 *
 * Returns: the folded character.
 */
static gunichar
unichar_tolower (gunichar uc)
{
	if (!g_unichar_isalpha(uc))
		return uc;

	if (g_unichar_get_script (uc) != G_UNICODE_SCRIPT_LATIN)
		return g_unichar_tolower (uc);

	switch (uc)
	{
	case 0x00e6:
	case 0x00c6: return 'e'; /* æ, Æ */
	case 0x00f8:
	case 0x00d8: return 'o'; /* ø, Ø */
	case 0x0110:
	case 0x0111: return 'd'; /* Đ, đ */
	/* todo: many more */
	default: return g_unichar_tolower (uc);
	}
}
|
||||
|
||||
/**
|
||||
* gx_utf8_flatten:
|
||||
* @str: a UTF-8 string
|
||||
* @len: the length of @str, or -1 if it is %NULL-terminated
|
||||
*
|
||||
* Flatten some UTF-8 string; that is, downcase it and remove any diacritics.
|
||||
*
|
||||
* Returns: (transfer full): a flattened string, free with g_free().
|
||||
*/
|
||||
static char*
|
||||
gx_utf8_flatten (const gchar *str, gssize len)
|
||||
{
|
||||
GString *gstr;
|
||||
char *norm, *cur;
|
||||
|
||||
g_return_val_if_fail (str, NULL);
|
||||
|
||||
norm = g_utf8_normalize (str, len, G_NORMALIZE_ALL);
|
||||
if (!norm)
|
||||
return NULL;
|
||||
|
||||
gstr = g_string_sized_new (strlen (norm));
|
||||
|
||||
for (cur = norm; cur && *cur; cur = g_utf8_next_char (cur))
|
||||
{
|
||||
gunichar uc;
|
||||
|
||||
uc = g_utf8_get_char (cur);
|
||||
if (g_unichar_combining_class (uc) != 0)
|
||||
continue;
|
||||
|
||||
g_string_append_unichar (gstr, unichar_tolower(uc));
|
||||
}
|
||||
|
||||
g_free (norm);
|
||||
|
||||
return g_string_free (gstr, FALSE);
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
|
||||
std::string // gx_utf8_flatten
|
||||
Mux::utf8_flatten (const std::string& str)
|
||||
{
|
||||
char *flat = gx_utf8_flatten (str.c_str(), str.length());
|
||||
if (!flat)
|
||||
return {};
|
||||
|
||||
std::string s(flat);
|
||||
g_free (flat);
|
||||
|
||||
return s;
|
||||
}
|
||||
|
||||
|
||||
std::string
|
||||
Mux::quote (const std::string& str)
|
||||
{
|
||||
char *s = g_strescape (str.c_str(), NULL);
|
||||
if (!s)
|
||||
return {};
|
||||
|
||||
std::string res (s);
|
||||
g_free (s);
|
||||
|
||||
return "\"" + res + "\"";
|
||||
}
|
||||
|
||||
std::string
|
||||
Mux::format (const char *frm, ...)
|
||||
{
|
||||
va_list args;
|
||||
|
||||
va_start (args, frm);
|
||||
|
||||
char *s = {};
|
||||
const auto res = vasprintf (&s, frm, args);
|
||||
va_end (args);
|
||||
if (res == -1) {
|
||||
std::cerr << "string format failed" << std::endl;
|
||||
return {};
|
||||
}
|
||||
|
||||
std::string str = s;
|
||||
free (s);
|
||||
|
||||
return str;
|
||||
}
|
||||
|
||||
/*
 * Internal date strings are 12 characters wide: the format zero-pads a
 * 64-bit integer to 12 digits, and the two boundaries are the smallest and
 * largest 12-digit strings.
 */
constexpr const char InternalDateMin[] = "000000000000";
constexpr const char InternalDateMax[] = "999999999999";
constexpr const auto InternalDateFormat = "%012" G_GINT64_FORMAT;

/* 12 characters each, not counting the terminating NUL */
static_assert(sizeof(InternalDateMin) - 1 == 12);
static_assert(sizeof(InternalDateMax) - 1 == 12);
|
||||
|
||||
static std::string
|
||||
date_boundary (bool is_first)
|
||||
{
|
||||
return is_first ? InternalDateMin : InternalDateMax;
|
||||
}
|
||||
|
||||
std::string
|
||||
Mux::date_to_time_t_string (time_t t)
|
||||
{
|
||||
char buf[sizeof(InternalDateMax)];
|
||||
snprintf (buf, sizeof(buf), InternalDateFormat, t);
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
|
||||
static std::string
|
||||
delta_ymwdhMs (const std::string& expr)
|
||||
{
|
||||
char *endptr;
|
||||
auto num = strtol (expr.c_str(), &endptr, 10);
|
||||
if (num <= 0 || num > 9999 || !endptr || !*endptr)
|
||||
return date_boundary (true);
|
||||
|
||||
int years, months, weeks, days, hours, minutes, seconds;
|
||||
years = months = weeks = days = hours = minutes = seconds = 0;
|
||||
|
||||
switch (endptr[0]) {
|
||||
case 's': seconds = num; break;
|
||||
case 'M': minutes = num; break;
|
||||
case 'h': hours = num; break;
|
||||
case 'd': days = num; break;
|
||||
case 'w': weeks = num; break;
|
||||
case 'm': months = num; break;
|
||||
case 'y': years = num; break;
|
||||
default:
|
||||
return date_boundary (true);
|
||||
}
|
||||
|
||||
GDateTime *then, *now = g_date_time_new_now_local ();
|
||||
if (weeks != 0)
|
||||
then = g_date_time_add_weeks (now, -weeks);
|
||||
else
|
||||
then = g_date_time_add_full (now, -years, -months,-days,
|
||||
-hours, -minutes, -seconds);
|
||||
|
||||
time_t t = MAX (0, (gint64)g_date_time_to_unix (then));
|
||||
|
||||
g_date_time_unref (then);
|
||||
g_date_time_unref (now);
|
||||
|
||||
return date_to_time_t_string (t);
|
||||
}
|
||||
|
||||
|
||||
static std::string
|
||||
special_date (const std::string& d, bool is_first)
|
||||
{
|
||||
if (d == "now")
|
||||
return date_to_time_t_string (time(NULL));
|
||||
|
||||
else if (d == "today") {
|
||||
|
||||
GDateTime *dt, *midnight;
|
||||
dt = g_date_time_new_now_local ();
|
||||
|
||||
if (!is_first) {
|
||||
GDateTime *tmp = dt;
|
||||
dt = g_date_time_add_days (dt, 1);
|
||||
g_date_time_unref (tmp);
|
||||
}
|
||||
|
||||
midnight = g_date_time_add_full (dt, 0, 0, 0,
|
||||
-g_date_time_get_hour(dt),
|
||||
-g_date_time_get_minute (dt),
|
||||
-g_date_time_get_second (dt));
|
||||
time_t t = MAX(0, (gint64)g_date_time_to_unix (midnight));
|
||||
g_date_time_unref (dt);
|
||||
g_date_time_unref (midnight);
|
||||
return date_to_time_t_string ((time_t)t);
|
||||
|
||||
} else
|
||||
return date_boundary (is_first);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Templates for user-supplied dates, in YYYYMMDDHHMMSS order (i.e., matching
 * the strptime format "%Y%m%d%H%M%S"); the digits the user provides
 * overwrite the template from the left, so whatever is left out defaults
 * to the earliest (lower bound) or latest (upper bound) value.
 *
 * Note that the month comes before the day: the maximum is December 31st,
 * "1231" -- "3112" would be month 31, which does not parse.
 */
constexpr const char UserDateMin[] = "19700101000000";
constexpr const char UserDateMax[] = "29991231235959";
|
||||
|
||||
std::string
|
||||
Mux::date_to_time_t_string (const std::string& dstr, bool is_first)
|
||||
{
|
||||
gint64 t;
|
||||
struct tm tbuf;
|
||||
GDateTime *dtime;
|
||||
|
||||
/* one-sided dates */
|
||||
if (dstr.empty())
|
||||
return date_boundary (is_first);
|
||||
else if (is_first && dstr.find_first_of("ymdwhMs") != std::string::npos)
|
||||
return delta_ymwdhMs (dstr);
|
||||
|
||||
std::string date (is_first ? UserDateMin : UserDateMax);
|
||||
std::copy_if (dstr.begin(), dstr.end(), date.begin(),[](auto c){return isdigit(c);});
|
||||
|
||||
memset (&tbuf, 0, sizeof tbuf);
|
||||
if (!strptime (date.c_str(), "%Y%m%d%H%M%S", &tbuf) &&
|
||||
!strptime (date.c_str(), "%Y%m%d%H%M", &tbuf) &&
|
||||
!strptime (date.c_str(), "%Y%m%d", &tbuf) &&
|
||||
!strptime (date.c_str(), "%Y%m", &tbuf) &&
|
||||
!strptime (date.c_str(), "%Y", &tbuf))
|
||||
return special_date (date, is_first);
|
||||
|
||||
dtime = g_date_time_new_local (tbuf.tm_year + 1900,
|
||||
tbuf.tm_mon + 1,
|
||||
tbuf.tm_mday,
|
||||
tbuf.tm_hour,
|
||||
tbuf.tm_min,
|
||||
tbuf.tm_sec);
|
||||
if (!dtime) {
|
||||
g_warning ("invalid %s date '%s'",
|
||||
is_first ? "lower" : "upper", date.c_str());
|
||||
return date_boundary (is_first);
|
||||
}
|
||||
|
||||
t = (gint64)g_date_time_to_unix (dtime);
|
||||
g_date_time_unref (dtime);
|
||||
|
||||
if (t < 0 || t > 9999999999)
|
||||
return date_boundary (is_first);
|
||||
else
|
||||
return date_to_time_t_string (t);
|
||||
}
|
||||
|
||||
|
||||
/*
 * Size strings are 10 characters wide: the format zero-pads a 64-bit
 * integer to 10 digits, and the two boundaries are the smallest and
 * largest 10-digit strings.
 */
constexpr const char SizeMin[] = "0000000000";
constexpr const char SizeMax[] = "9999999999";
constexpr const auto SizeFormat = "%010" G_GINT64_FORMAT;

/* 10 characters each, not counting the terminating NUL */
static_assert(sizeof(SizeMin) - 1 == 10);
static_assert(sizeof(SizeMax) - 1 == 10);
|
||||
|
||||
static std::string
|
||||
size_boundary (bool is_first)
|
||||
{
|
||||
return is_first ? SizeMin : SizeMax;
|
||||
}
|
||||
|
||||
std::string
|
||||
Mux::size_to_string (int64_t size)
|
||||
{
|
||||
char buf[sizeof(SizeMax)];
|
||||
snprintf (buf, sizeof(buf), SizeFormat, size);
|
||||
|
||||
return buf;
|
||||
}
|
||||
|
||||
std::string
|
||||
Mux::size_to_string (const std::string& val, bool is_first)
|
||||
{
|
||||
std::string str;
|
||||
GRegex *rx;
|
||||
GMatchInfo *minfo;
|
||||
|
||||
/* one-sided ranges */
|
||||
if (val.empty())
|
||||
return size_boundary (is_first);
|
||||
|
||||
rx = g_regex_new ("(\\d+)(b|k|kb|m|mb|g|gb)?",
|
||||
G_REGEX_CASELESS, (GRegexMatchFlags)0, NULL);
|
||||
minfo = NULL;
|
||||
if (g_regex_match (rx, val.c_str(), (GRegexMatchFlags)0, &minfo)) {
|
||||
gint64 size;
|
||||
char *s;
|
||||
|
||||
s = g_match_info_fetch (minfo, 1);
|
||||
size = atoll (s);
|
||||
g_free (s);
|
||||
|
||||
s = g_match_info_fetch (minfo, 2);
|
||||
switch (s ? g_ascii_tolower(s[0]) : 0) {
|
||||
case 'k': size *= 1024; break;
|
||||
case 'm': size *= (1024 * 1024); break;
|
||||
case 'g': size *= (1024 * 1024 * 1024); break;
|
||||
default: break;
|
||||
}
|
||||
|
||||
g_free (s);
|
||||
str = size_to_string (size);
|
||||
} else
|
||||
str = size_boundary (is_first);
|
||||
|
||||
g_regex_unref (rx);
|
||||
g_match_info_unref (minfo);
|
||||
|
||||
return str;
|
||||
}
|
||||
Reference in New Issue
Block a user