1
0
mirror of https://github.com/rkd77/elinks.git synced 2024-11-04 08:17:17 -05:00
elinks/src/ecmascript/css2xpath.cpp

561 lines
15 KiB
C++
Raw Normal View History

// This code is based on github.com/theseer/css2xpath and other sources
2021-09-28 10:25:55 -04:00
#include <string>
#include <vector>
#include <sstream>
#include <iterator>
2021-09-29 10:25:35 -04:00
#include <utility>
2021-09-30 13:15:44 -04:00
#include <regex>
#include <functional>
#include <cstdlib>
#include <iostream>
#include <cstdlib>
#include <iostream>
#include <string>
#include <regex>
2021-10-25 11:31:05 -04:00
#include "ecmascript/css2xpath.h"
// NOTE(review): adding function templates to namespace std is not sanctioned
// by the C++ standard. They are kept here only because the callers in this
// file spell the name std::regex_replace2.
namespace std
{

// Callback flavour of std::regex_replace over the range [first, last).
//
// Every match of `re` is replaced by the string returned from f(match);
// the text between matches (and after the last one) is copied unchanged.
// `f` receives a std::match_results<BidirIt> and must return something
// that can be appended to a std::basic_string<CharT>.
template<class BidirIt, class Traits, class CharT, class UnaryFunction>
std::basic_string<CharT> regex_replace2(BidirIt first, BidirIt last,
    const std::basic_regex<CharT,Traits>& re, UnaryFunction f)
{
    std::basic_string<CharT> s;
    // One past the end of the previous match, i.e. start of the text that
    // still has to be copied through verbatim.
    BidirIt endOfLastMatch = first;
    const std::regex_iterator<BidirIt> end;

    for (std::regex_iterator<BidirIt> it(first, last, re); it != end; ++it) {
        const std::match_results<BidirIt>& match = *it;

        s.append(endOfLastMatch, match[0].first);
        s.append(f(match));
        endOfLastMatch = match[0].second;
    }
    s.append(endOfLastMatch, last);
    return s;
}

// Convenience overload: same as above, applied to a whole std::string.
template<class Traits, class CharT, class UnaryFunction>
std::string regex_replace2(const std::string& s,
    const std::basic_regex<CharT,Traits>& re, UnaryFunction f)
{
    return regex_replace2(s.cbegin(), s.cend(), re, f);
}

} // namespace std
using namespace std;
2021-09-28 10:25:55 -04:00
// Joins the elements of `x` into a single string, writing `delim` between
// consecutive elements (never before the first or after the last one).
// An empty vector yields an empty string; a null `delim` joins without any
// separator.
std::string
implode(const char *const delim, std::vector<std::string> & x)
{
    std::string joined;
    bool first = true;

    for (const std::string &item : x) {
        if (!first && delim) {
            joined += delim;
        }
        joined += item;
        first = false;
    }
    return joined;
}
2021-09-29 10:25:35 -04:00
// Splits `s` at every occurrence of `delim` and returns the pieces in order.
// Empty pieces are kept, including the one after a trailing delimiter
// ("4+" -> {"4", ""}). An empty input yields an empty vector.
std::vector<std::string>
explode(char delim, std::string const & s)
{
    std::vector<std::string> result;
    std::istringstream iss(s);

    for (std::string token; std::getline(iss, token, delim); ) {
        result.push_back(std::move(token));
    }
    // std::getline does not report the empty field that follows a trailing
    // delimiter, so add it by hand. The emptiness check is required:
    // calling back() on an empty string is undefined behaviour.
    if (!s.empty() && s.back() == delim) {
        result.push_back(std::string());
    }
    return result;
}
2021-09-30 13:15:44 -04:00
// Replaces every match of the regular expression `pattern` in `subject` with
// `replacement` (std::regex_replace format syntax, e.g. "$1") and returns the
// result.
// If `pattern` is not a valid regular expression, a diagnostic is written to
// stderr and `subject` is returned unchanged.
std::string
preg_replace(std::string & pattern, const char *replacement, std::string & subject)
{
    try {
        return std::regex_replace(subject, std::regex(pattern), replacement);
    } catch (const std::regex_error &e) {
        // Diagnostics belong on stderr, not on the program's regular output.
        std::cerr << e.what() << " " << pattern << "\n";
    }
    return subject;
}
using namespace std;
2021-10-06 12:41:14 -04:00
typedef std::string (*my_callback_t)(const std::smatch &m);
std::string
2021-10-06 12:41:14 -04:00
preg_replace_callback(std::string & pattern, my_callback_t callback, std::string & subject)
{
return std::regex_replace2(subject, std::regex(pattern), callback);
}
2021-10-01 14:36:35 -04:00
// Builds the XPath predicate for a CSS "[attr$=value]" (ends-with) match.
//
// matches[1] is the attribute name, matches[2] the value exactly as it
// appears in the selector. The value is expected to still carry its two
// quote characters (the translator quotes attribute values in an earlier
// rule), so the unquoted length is size() - 2 and the substring has to start
// (size() - 3) characters before the end of the attribute.
std::string
dollar_equal_rule_callback(const std::smatch &matches)
{
    const std::string attr = matches[1].str();
    const std::string value = matches[2].str();
    // Signed on purpose: a value shorter than three characters must not wrap
    // around to a huge unsigned number.
    const long offset = static_cast<long>(value.size()) - 3;

    std::ostringstream os;
    os << "[substring(@" << attr << ",string-length(@" << attr << ")-" << offset << ")=";
    // Compare against the value itself; emitting the attribute name here
    // would make XPath compare with a child element of that name.
    os << value << "]";
    return os.str();
}
// Rewrites every "[attr$=value]" occurrence in `selector` through
// dollar_equal_rule_callback and returns the rewritten selector.
std::string
dollar_equal_rule_apply(std::string &selector)
{
    std::string ends_with_attr("\\[([a-zA-Z0-9\\_\\-]+)\\$=([^\\]]+)\\]");
    std::string rewritten = preg_replace_callback(ends_with_attr, dollar_equal_rule_callback, selector);

    return rewritten;
}
2021-10-02 13:47:23 -04:00
class Translator;

// Base class of the selector rewrite rules. Each rule turns one piece of CSS
// selector syntax into its XPath counterpart; this default implementation
// returns the selector unchanged.
class Rule
{
public:
    Rule()
    {
    }

    // Rules are created with `new` and handled through Rule pointers, so the
    // destructor has to be virtual for deletion through the base to be valid.
    virtual ~Rule() = default;

    // Returns the rewritten form of `selector`.
    virtual std::string apply(std::string &selector)
    {
        return selector;
    }
};
class RegexRule : public Rule
{
private:
const char *pattern, *replacement;
public:
RegexRule(const char *pat, const char *repl) : pattern(pat), replacement(repl)
{
}
2021-10-03 16:59:47 -04:00
std::string apply(std::string &selector)
{
std::string r(pattern);
return preg_replace(r, replacement, selector);
}
2021-10-02 13:47:23 -04:00
};
class NotRule : public Rule
{
2021-10-03 16:59:47 -04:00
private:
Translator *t;
2021-10-02 13:47:23 -04:00
public:
2021-10-03 16:59:47 -04:00
NotRule(Translator *tt);
std::string apply(std::string &selector);
std::string callback(const std::smatch &matches);
2021-10-02 13:47:23 -04:00
};
2021-10-03 16:59:47 -04:00
// Plain-function trampoline with the my_callback_t signature; it forwards a
// match to NotRule::callback on the instance stored in currentNotRule.
// Defined further down in this file.
std::string
not_rule_callback(const std::smatch &m);
// NotRule instance used by not_rule_callback; assigned in NotRule::apply.
// NOTE(review): file-level mutable state — presumably assumes single-threaded
// use; confirm before calling css2xpath from more than one thread.
class NotRule *currentNotRule;
// Returns true when every character of `str` is an ASCII decimal digit.
// An empty string contains no offending character and therefore yields true.
static bool
is_digits(const std::string &str)
{
    // Compare against the digit range directly instead of calling ::isdigit:
    // passing a plain (possibly negative) char to ::isdigit is undefined.
    for (const char c : str) {
        if (c < '0' || c > '9') {
            return false;
        }
    }
    return true;
}
// Forward declaration; the class itself is defined right below.
class NthChildRule;
// NthChildRule instance used by nth_callback; assigned in NthChildRule::apply.
// NOTE(review): file-level mutable state — presumably assumes single-threaded
// use; confirm before calling css2xpath from more than one thread.
class NthChildRule *currentNth;
// Plain-function trampoline with the my_callback_t signature; it forwards a
// match to NthChildRule::callback on the instance stored in currentNth.
// Defined further down in this file.
std::string
nth_callback(const std::smatch &m);
2021-10-02 13:47:23 -04:00
class NthChildRule : public Rule
{
public:
NthChildRule()
{
2021-10-03 16:59:47 -04:00
}
std::string apply(std::string & selector)
{
2021-10-04 11:21:44 -04:00
currentNth = this;
2021-10-03 16:59:47 -04:00
std::string pat("([a-zA-Z0-9_\\-*]+):nth-child\\(([^)]*)\\)");
return preg_replace_callback(pat, nth_callback, selector);
}
std::string callback(const std::smatch &matches)
{
std::ostringstream os;
std::ostringstream res;
os << matches[2];
if (os.str() == "n")
{
res << matches[1];
}
else if (os.str() == "even")
{
res << matches[1] << "[(count(preceding-sibling::*) + 1) mod 2=0]";
}
else if (os.str() == "odd")
{
res << matches[1] << "[(count(preceding-sibling::*) + 1) mod 2=1]";
}
else if (is_digits(os.str()))
{
res << "*[" << matches[2] << "]/self::" << matches[1];
}
else
{
std::string pat("^([\\d]*)n.*?([\\d]*)$");
std::string m = matches[2].str();
std::string b1 = preg_replace(pat, "$1+$2", m);
auto b = explode('+', b1);
2021-10-04 09:10:03 -04:00
int bint = atoi(b[1].c_str());
res << matches[1] << "[(count(preceding-sibling::*)+1)>=" << bint << " and ((count(preceding-sibling::*)+1)-"
<< bint << ") mod " << b[0] << "=0]";
2021-10-03 16:59:47 -04:00
}
return res.str();
2021-10-02 13:47:23 -04:00
}
};
2021-10-03 16:59:47 -04:00
// Trampoline with the my_callback_t signature: hands the match to the
// NthChildRule currently stored in ::currentNth.
std::string
nth_callback(const std::smatch &m)
{
    return ::currentNth->callback(m);
}
// Trampoline with the my_callback_t signature: hands the match to the
// NotRule currently stored in ::currentNotRule.
std::string
not_rule_callback(const std::smatch &m)
{
    return ::currentNotRule->callback(m);
}
2021-10-02 13:47:23 -04:00
class DollarEqualRule : public Rule
{
public:
DollarEqualRule()
{
}
2021-10-03 16:59:47 -04:00
std::string apply(std::string &selector)
{
std::string pattern("\\[([a-zA-Z0-9\\_\\-]+)\\$=([^\\]]+)\\]");
return preg_replace_callback(pattern, dollar_equal_rule_callback, selector);
}
2021-10-02 13:47:23 -04:00
};
class Translator
{
public:
std::string translate(std::string & selector)
{
for (auto r : rules)
{
selector = r->apply(selector);
}
2021-10-05 15:43:23 -04:00
std::string quotes("\"\"");
selector = preg_replace(quotes, "\"", selector);
2021-10-02 13:47:23 -04:00
return selector == "/" ? "/" : ("//" + selector);
}
Translator()
{
}
private:
2021-10-14 08:10:28 -04:00
Rule *rules[34] = {
2021-10-02 13:47:23 -04:00
// prefix|name
new RegexRule("([a-zA-Z0-9\\_\\-\\*]+)\\|([a-zA-Z0-9\\_\\-\\*]+)", "$1:$2"),
// add @ for attribs
new RegexRule("\\[([^G\\]~\\$\\*\\^\\|\\!]+)(=[^\\]]+)?\\]", "[@$1$2]"),
// multiple queries
new RegexRule("\\s*,\\s*", "|"),
// , + ~ >
new RegexRule("\\s*([\\+~>])\\s*", "$1"),
//* ~ + >
new RegexRule("([a-zA-Z0-9\\_\\-\\*])~([a-zA-Z0-9\\_\\-\\*])", "$1/following-sibling::$2"),
new RegexRule("([a-zA-Z0-9\\_\\-\\*])\\+([a-zA-Z0-9\\_\\-\\*])", "$1/following-sibling::*[1]/self::$2"),
new RegexRule("([a-zA-Z0-9\\_\\-\\*])>([a-zA-Z0-9\\_\\-\\*])", "$1/$2"),
// all unescaped stuff escaped
new RegexRule("\\[([^=]+)=([^'|][^\\]]*)\\]", "[$1=\"$2\"]"),
// all descendant or self to //
new RegexRule("(^|[^a-zA-Z0-9\\_\\-\\*])([#\\.])([a-zA-Z0-9\\_\\-]+)", "$1*$2$3"),
new RegexRule("([\\>\\+\\|\\~\\,\\s])([a-zA-Z\\*]+)", "$1//$2"),
2021-10-03 16:59:47 -04:00
new RegexRule("\\s+\\/\\/", "//"),
2021-10-02 13:47:23 -04:00
// :first-child
new RegexRule("([a-zA-Z0-9\\_\\-\\*]+):first-child", "*[1]/self::$1"),
// :last-child
new RegexRule("([a-zA-Z0-9\\_\\-\\*]+)?:last-child", "$1[not(following-sibling::*)]"),
// :only-child
new RegexRule("([a-zA-Z0-9\\_\\-\\*]+):only-child", "*[last()=1]/self::$1"),
// :empty
new RegexRule("([a-zA-Z0-9\\_\\-\\*]+)?:empty", "$1[not(*) and not(normalize-space())]"),
// :not
new NotRule(this),
// :nth-child
new NthChildRule(),
// :contains(selectors)
new RegexRule(":contains\\(([^\\)]*)\\)", "[contains(string(.),\"$1\")]"),
// |= attrib
new RegexRule("\\[([a-zA-Z0-9\\_\\-]+)\\|=([^\\]]+)\\]", "[@$1=$2 or starts-with(@$1,concat($2,\"-\"))]"),
// *= attrib
new RegexRule("\\[([a-zA-Z0-9\\_\\-]+)\\*=([^\\]]+)\\]", "[contains(@$1,$2)]"),
// ~= attrib
new RegexRule("\\[([a-zA-Z0-9\\_\\-]+)~=([^\\]]+)\\]", "[contains(concat(\" \",normalize-space(@$1),\" \"),concat(\" \",$2,\" \"))]"),
// ^= attrib
new RegexRule("\\[([a-zA-Z0-9\\_\\-]+)\\^=([^\\]]+)\\]", "[starts-with(@$1,$2)]"),
// $= attrib
new DollarEqualRule(),
// != attrib
new RegexRule("\\[([a-zA-Z0-9\\_\\-]+)\\!=[\\\"']+?([^\\\"\\]]+)[\\\"']+?\\]", "[not(@$1) or @$1!=\"$2\"]"),
// ids
new RegexRule("#([a-zA-Z0-9\\_\\-]+)", "[@id=\"$1\"]"),
// classes
new RegexRule("\\.([a-zA-Z0-9_-]+)(?![^[]*])", "[contains(concat(\" \",normalize-space(@class),\" \"),\" $1 \")]"),
// normalize multiple filters
new RegexRule("\\]\\[([^\\]]+)", " and ($1)"),
// tag:pseudo selectors
new RegexRule("(:enabled)", "[not(@disabled)]"),
new RegexRule("(:checked)", "[@checked=\"checked\"]"),
new RegexRule(":(disabled)", "[@$1]"),
new RegexRule(":root", "/"),
2021-10-14 08:10:28 -04:00
new RegexRule(":scope", ""),
2021-10-02 13:47:23 -04:00
// use * when tag was omitted
2021-10-03 16:59:47 -04:00
new RegexRule("^\\[", "*["),
2021-10-02 13:47:23 -04:00
new RegexRule("\\|\\[", "|*[")
};
};
2021-10-01 14:36:35 -04:00
2021-10-03 16:59:47 -04:00
NotRule::NotRule(Translator *tt) : t(tt)
{
}
std::string
NotRule::apply(std::string &selector)
{
2021-10-04 11:21:44 -04:00
currentNotRule = this;
2021-10-03 16:59:47 -04:00
std::string pat("([a-zA-Z0-9\\_\\-\\*]+):not\\(([^\\)]*)\\)");
return preg_replace_callback(pat, not_rule_callback, selector);
}
std::string
NotRule::callback(const std::smatch &matches)
{
std::string m(matches[2].str());
std::string pat("^[^\\[]+\\[([^\\]]*)\\].*$");
std::string ret(t->translate(m));
std::string subresult = preg_replace(pat, "$1", ret);
return matches[1].str() + "[not(" + subresult + ")]";
}
// Translates the CSS selector in `selector` into an XPath expression and
// returns it. `selector` itself is rewritten during translation.
std::string
css2xpath(std::string &selector)
{
    // A function-local static is constructed exactly once, on first use, and
    // that construction is thread-safe; no manual null check or `new` needed.
    static Translator translator;

    return translator.translate(selector);
}
2021-10-03 16:59:47 -04:00
#if 0
// Sample callback for preg_replace_callback: keeps capture group 1 and
// replaces capture group 2 (a number) with that number plus one.
std::string
next_year(const std::smatch& matches)
{
    const std::string prefix = matches[1].str();
    const std::string year_text = matches[2].str();
    const int following = atoi(year_text.c_str()) + 1;

    std::ostringstream out;
    out << prefix;
    out << following;
    return out.str();
}
2021-10-06 12:41:14 -04:00
typedef const char *test_t[2];
2021-10-03 13:37:19 -04:00
void
2021-10-06 12:41:14 -04:00
tests_t()
2021-10-03 13:37:19 -04:00
{
test provider[] = {
{"div", "//div"},
{"body div", "//body//div"},
{"div p", "//div//p"},
{"div > p", "//div/p"},
{"div + p", "//div/following-sibling::*[1]/self::p"},
{"div ~ p", "//div/following-sibling::p"},
{"div p a", "//div//p//a"},
{"div, p, a", "//div|//p|//a"},
{".note", "//*[contains(concat(\" \",normalize-space(@class),\" \"),\" note \")]"},
{"div.example", "//div[contains(concat(\" \",normalize-space(@class),\" \"),\" example \")]"},
{"ul .tocline2", "//ul//*[contains(concat(\" \",normalize-space(@class),\" \"),\" tocline2 \")]"},
{"div.example, div.note", "//div[contains(concat(\" \",normalize-space(@class),\" \"),\" example \")]|//div[contains(concat(\" \",normalize-space(@class),\" \"),\" note \")]"},
{"#title", "//*[@id=\"title\"]"},
{"h1#title", "//h1[@id=\"title\"]"},
{"div #title", "//div//*[@id=\"title\"]"},
{"ul.toc li.tocline2", "//ul[contains(concat(\" \",normalize-space(@class),\" \"),\" toc \")]//li[contains(concat(\" \",normalize-space(@class),\" \"),\" tocline2 \")]"},
{"ul.toc > li.tocline2", "//ul[contains(concat(\" \",normalize-space(@class),\" \"),\" toc \")]/li[contains(concat(\" \",normalize-space(@class),\" \"),\" tocline2 \")]"},
{"h1#title + div > p", "//h1[@id=\"title\"]/following-sibling::*[1]/self::div/p"},
{"h1[id]:contains(Selectors)", "//h1[@id and (contains(string(.),\"Selectors\"))]"},
{"a[href][lang][class]", "//a[@href and (@lang) and (@class)]"},
{"div[class]", "//div[@class]"},
{"div[class=example]", "//div[@class=\"example\"]"},
{"div[class^=exa]", "//div[starts-with(@class,\"exa\")]"},
{"div[class$=mple]", "//div[substring(@class,string-length(@class)-3)=class]"},
{"div[class*=e]", "//div[contains(@class,\"e\")]"},
{"div[class|=dialog]", "//div[@class=\"dialog\" or starts-with(@class,concat(\"dialog\",\"-\"))]"},
{"div[class!=made_up]", "//div[not(@class) or @class!=\"made_up\"]"},
{"div[property!=\"made_up\"]", "//div[not(@property) or @property!=\"made_up\"]"},
{"div[class~=example]", "//div[contains(concat(\" \",normalize-space(@class),\" \"),concat(\" \",\"example\",\" \"))]"},
{"div:not(.example)", "//div[not(contains(concat(\" \",normalize-space(@class),\" \"),\" example \"))]"},
{"p:contains(selectors)", "//p[contains(string(.),\"selectors\")]"},
{"p:nth-child(n)", "//p"},
{"p:nth-child(even)", "//p[(count(preceding-sibling::*) + 1) mod 2=0]"},
{"p:nth-child(odd)", "//p[(count(preceding-sibling::*) + 1) mod 2=1]"},
{"p:nth-child(3n+8)", "//p[(count(preceding-sibling::*)+1)>=8 and ((count(preceding-sibling::*)+1)-8) mod 3=0]"},
{"p:nth-child(2n+1)", "//p[(count(preceding-sibling::*)+1)>=1 and ((count(preceding-sibling::*)+1)-1) mod 2=0]"},
{"p:nth-child(3)", "//*[3]/self::p"},
{"p:nth-child(4n)", "//p[(count(preceding-sibling::*)+1)>=0 and ((count(preceding-sibling::*)+1)-0) mod 4=0]"},
{"p:only-child", "//*[last()=1]/self::p"},
{"p:last-child", "//p[not(following-sibling::*)]"},
{"p:first-child", "//*[1]/self::p"},
{"foo|bar", "//foo:bar"},
{"div[class^=exa][class$=mple]", "//div[starts-with(@class,\"exa\") and (substring(@class,string-length(@class)-3)=class)]"},
{"input:enabled", "//input[not(@disabled)]"},
{"input:checked", "//input[@checked=\"checked\"]"},
{"input:disabled", "//input[@disabled]"},
{":empty", "//*[not(*) and not(normalize-space())]"},
2021-10-05 15:43:23 -04:00
{":root", "/"},
{"meta[name=\"gaf\"]", "//meta[@name=\"gaf\"]"}
2021-10-03 13:37:19 -04:00
};
for (auto t: provider)
{
std::string selector(t[0]);
std::string expected(t[1]);
2021-10-04 11:21:44 -04:00
std::string result = css2xpath(selector);
2021-10-03 16:59:47 -04:00
std::cout << t[0] << " ";
2021-10-03 13:43:47 -04:00
std::cout << ((result == expected) ? "\033[32mOK\033[0m" : "\033[31mFAIL\033[0m");
2021-10-03 13:37:19 -04:00
std::cout << "\n";
}
}
2021-09-28 10:25:55 -04:00
int
main(int argc, char **argv)
{
2021-10-03 16:59:47 -04:00
#if 0
2021-09-30 14:26:45 -04:00
#if 1
2021-09-28 10:25:55 -04:00
std::vector<std::string> x;
for (int i = 2; i < argc; i++) {
x.push_back(argv[i]);
}
std::string res = implode(argv[1], x);
std::cout << res << "\n";
2021-09-29 10:25:35 -04:00
#endif
2021-09-30 14:26:45 -04:00
#if 1
2021-09-29 10:25:35 -04:00
auto v = explode(' ', "hello world foo bar");
for (auto e : v)
{
std::cout << e << "\n";
}
2021-09-30 13:15:44 -04:00
#endif
#if 1
2021-09-30 13:15:44 -04:00
std::string str = "April 15, 2003";
std::string pattern = "(\\w+) (\\d+), (\\d+)";
const char *replacement = "$1,1,$3";
std::cout << preg_replace(pattern, replacement, str);
#endif
std::string text = "April fools day is 04/01/2002\n";
text += "Last christmas was 12/24/2001\n";
std::string pat2 = "(\\d{2}/\\d{2}/)(\\d{4})";
2021-09-30 13:15:44 -04:00
std::cout << preg_replace_callback(pat2, next_year, text);
2021-10-03 16:59:47 -04:00
#endif
#if 0
auto v = explode('+', "4+");
for (auto e : v)
{
std::cout << e << "\n";
}
#endif
2021-09-29 10:25:35 -04:00
2021-10-03 13:37:19 -04:00
tests();
2021-09-28 10:25:55 -04:00
return 0;
}
#endif