/*
** Copyright (C) 2023 Dirk-Jan C. Binnema
**
** This program is free software; you can redistribute it and/or modify it
** under the terms of the GNU General Public License as published by the
** Free Software Foundation; either version 3, or (at your option) any
** later version.
**
** This program is distributed in the hope that it will be useful,
** but WITHOUT ANY WARRANTY; without even the implied warranty of
** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
** GNU General Public License for more details.
**
** You should have received a copy of the GNU General Public License
** along with this program; if not, write to the Free Software Foundation,
** Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
**
*/

#include "mu-utils.hh"
#include "mu-option.hh"
#include "mu-regex.hh"

// NOTE(review): the names of the four system headers were missing here;
// they are restored from the standard-library names used below (plus
// <cctype> for ::tolower / ::isalpha) -- confirm against upstream.
#include <string>
#include <string_view>
#include <algorithm>
#include <stdexcept>
#include <cctype>

using namespace Mu;

/**
 * Does haystack start with needle, ignoring case?
 *
 * @param haystack string to inspect
 * @param needle prefix to look for (case-insensitive)
 *
 * @return true if haystack begins with needle, false otherwise
 */
static bool
starts_with(std::string_view haystack, std::string_view needle)
{
	if (needle.size() > haystack.size())
		return false;

	// the <cctype> functions have undefined behavior for negative values,
	// which is what plain (signed) char yields for non-ASCII bytes; so go
	// through unsigned char first.
	const auto lower = [](char c) {
		return ::tolower(static_cast<unsigned char>(c));
	};

	return std::equal(needle.begin(), needle.end(), haystack.begin(),
			  [&](char n, char h) { return lower(n) == lower(h); });
}

/**
 * Are haystack and needle equal, ignoring case?
 *
 * @param haystack first string
 * @param needle second string
 *
 * @return true if both have the same length and the same characters
 * (case-insensitive), false otherwise
 */
static bool
matches(std::string_view haystack, std::string_view needle)
{
	return needle.size() == haystack.size() && starts_with(haystack, needle);
}

/**
 * HTML parsing context
 *
 * Keeps a reference to the HTML being parsed, the current position in it,
 * and a buffer with the text scraped so far.
 */
class Context {
public:
	/**
	 * Construct a parsing context
	 *
	 * The context only stores a reference to @p html (no copy is made),
	 * so the string must outlive the context.
	 *
	 * @param html some html to parse
	 */
	Context(const std::string& html): html_{html}, pos_{} {}

	/**
	 * Are we done with the html blob, i.e, has it been fully scraped?
	 *
	 * @return true or false
	 */
	bool done() const {
		return pos_ >= html_.size();
	}

	/**
	 * Get the current position
	 *
	 * @return position
	 */
	size_t position() const {
		return pos_;
	}

	/**
	 * Get the size of the HTML
	 *
	 * @return size
	 */
	size_t size() const {
		return html_.size();
	}

	/**
	 * Advance the position by _n_ characters.
	 *
	 * Advancing up to (and including) the end of the html is allowed.
	 *
	 * @param n number by which to advance.
	 *
	 * @throws std::range_error when advancing would move past the end
	 */
	void advance(size_t n=1) {
		// written as a subtraction so a huge n cannot wrap around
		if (pos_ > html_.size() || n > html_.size() - pos_)
			throw std::range_error("out of range");
		pos_ += n;
	}

	/**
	 * Are we looking at the given string?
	 *
	 * @param str string to match (case-insensitive)
	 *
	 * @return true if the html at the current position starts with str;
	 * false otherwise, including when we are already done()
	 */
	bool looking_at(std::string_view str) const {
		// str may end exactly at the end of the html, so only reject
		// it when it is strictly longer than what remains.
		if (pos_ >= html_.size() || str.size() > html_.size() - pos_)
			return false;

		return matches({html_.data() + pos_, str.size()}, str);
	}

	/**
	 * Grab a substring-view from the html
	 *
	 * @param fpos starting position
	 * @param len length
	 *
	 * @return string view
	 *
	 * @throws std::range_error when [fpos, fpos + len) does not fit in
	 * the html
	 */
	std::string_view substr(size_t fpos, size_t len) const {
		// subtraction form: fpos + len might overflow
		if (fpos > html_.size() || len > html_.size() - fpos)
			throw std::range_error(mu_format("{} + {} > {}",
							 fpos, len, html_.size()));

		return { html_.data() + fpos, len };
	}

	/**
	 * Grab the string of alphabetic characters at the
	 * head (pos) of the context, and advance over it.
	 *
	 * @return the head-word or empty
	 */
	std::string_view eat_head_word() {
		const size_t start_pos{pos_};

		// cast to unsigned char: ::isalpha has undefined behavior for
		// the negative values non-ASCII bytes have as plain char.
		while (!done() &&
		       ::isalpha(static_cast<unsigned char>(html_[pos_])))
			++pos_;

		return {html_.data() + start_pos, pos_ - start_pos};
	}

	/**
	 * Get the scraped data; only available when done()
	 *
	 * This is a cleaned-up copy of the raw scrape buffer.
	 *
	 * @return scraped data
	 */
	std::string scraped() const {
		return cleanup(raw_scraped_);
	}

	/**
	 * Get the raw scrape buffer, where we can append
	 * scraped data.
	 *
	 * @return the buffer
	 */
	std::string& raw_scraped() {
		return raw_scraped_;
	}

	/**
	 * Get a reference to the HTML
	 *
	 * @return html
	 */
	const std::string& html() const {
		return html_;
	}

private:
	/**
	 * Cleanup some raw scraped html: remove superfluous
	 * whitespace, avoid too long lines.
	 *
	 * Runs of spaces, tabs and newlines are collapsed into a single
	 * space, or into a newline once more than 80 non-whitespace
	 * characters were seen since the last inserted newline. Leading and
	 * trailing whitespace is dropped.
	 *
	 * @param unclean
	 *
	 * @return cleaned up string.
	 */
	std::string cleanup(const std::string& unclean) const {
		// reduce whitespace and avoid too long lines;
		// makes it easier to debug.
		bool was_wspace{};
		size_t col{};
		std::string clean;
		clean.reserve(unclean.size()/2);

		for (const char c: unclean) {
			if (c == ' ' || c == '\t' || c == '\n') {
				// remember the gap; it's emitted lazily, when
				// the next non-whitespace character shows up
				was_wspace = true;
				continue;
			}

			++col;
			if (was_wspace) {
				if (col > 80) {
					clean += '\n';
					col = 0;
				} else if (!clean.empty())
					clean += ' ';
				was_wspace = false;
			}
			clean += c;
		}

		return clean;
	}

	const std::string&	html_; // no copy!
	size_t			pos_{};
	std::string		raw_scraped_;
};

/**
 * Format a context for debugging output: position, size, and (up to) the
 * next 8 characters of the html.
 *
 * @param ctx a context
 *
 * @return formatted string
 */
G_GNUC_UNUSED static auto
format_as(const Context& ctx)
{
	return mu_format("<{}:{}: '{}'>",
			 ctx.position(), ctx.size(),
			 ctx.substr(ctx.position(),
				    std::min(static_cast<size_t>(8),
					     ctx.size() - ctx.position())));
}

/**
 * Advance the context until it is looking at the (closing) quote, or until
 * the html is exhausted. The quote itself is not consumed.
 *
 * @param ctx a context
 * @param quote the closing quote to look for
 */
static void
skip_quoted(Context& ctx, std::string_view quote)
{
	while(!ctx.done()) {
		if (ctx.looking_at(quote)) // closing quote
			return;
		ctx.advance();
	}
}

// attempt to skip over