// This code is based on github.com/theseer/css2xpath and other sources

#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>

// Include the project header whenever it is reachable. The guard only exists
// so that this file can also be compiled on its own (for instance with the
// self-test at the bottom enabled) without the project's include path.
#if defined(__has_include)
#if __has_include("ecmascript/css2xpath.h")
#include "ecmascript/css2xpath.h"
#endif
#else
#include "ecmascript/css2xpath.h"
#endif

// NOTE(review): adding declarations to namespace std is formally undefined
// behaviour. The helpers are kept here so that the qualified name
// std::regex_replace2 stays valid; consider moving them to a private
// namespace once all users are known.
namespace std {

/**
 * Replace every match of @p re in [first, last) with the string returned by
 * @p f for that match.
 *
 * @param first  start of the subject sequence
 * @param last   end of the subject sequence
 * @param re     regular expression to search for
 * @param f      callable taking the match_results of one match and returning
 *               the replacement text
 * @return the subject with all matches replaced; unmatched text is copied
 *         verbatim
 */
template<class BidirIt, class Traits, class CharT, class UnaryFunction>
std::basic_string<CharT>
regex_replace2(BidirIt first, BidirIt last, const std::basic_regex<CharT, Traits>& re, UnaryFunction f)
{
	std::basic_string<CharT> s;

	typename std::match_results<BidirIt>::difference_type positionOfLastMatch = 0;
	auto endOfLastMatch = first;

	auto callback = [&](const std::match_results<BidirIt>& match) {
		auto positionOfThisMatch = match.position(0);
		auto diff = positionOfThisMatch - positionOfLastMatch;

		auto startOfThisMatch = endOfLastMatch;
		std::advance(startOfThisMatch, diff);

		// Text between the previous match and this one is kept as is.
		s.append(endOfLastMatch, startOfThisMatch);
		s.append(f(match));

		auto lengthOfMatch = match.length(0);

		positionOfLastMatch = positionOfThisMatch + lengthOfMatch;

		endOfLastMatch = startOfThisMatch;
		std::advance(endOfLastMatch, lengthOfMatch);
	};

	std::regex_iterator<BidirIt> begin(first, last, re), end;
	std::for_each(begin, end, callback);

	// Tail after the last match.
	s.append(endOfLastMatch, last);

	return s;
}

/** Convenience overload of regex_replace2() for a whole std::string. */
template<class Traits, class CharT, class UnaryFunction>
std::string
regex_replace2(const std::string& s, const std::basic_regex<CharT, Traits>& re, UnaryFunction f)
{
	return regex_replace2(s.cbegin(), s.cend(), re, f);
}

} // namespace std

/**
 * Join the elements of @p x, putting @p delim between consecutive elements.
 *
 * @return "" for an empty vector, the single element for a one-element
 *         vector, otherwise the delimited concatenation.
 */
std::string
implode(const char *const delim, std::vector<std::string> & x)
{
	if (x.empty()) {
		return std::string();
	}
	std::ostringstream os;
	// All elements but the last are followed by the delimiter.
	std::copy(x.begin(), x.end() - 1, std::ostream_iterator<std::string>(os, delim));
	os << x.back();

	return os.str();
}

/**
 * Split @p s at every occurrence of @p delim.
 *
 * A trailing delimiter yields a trailing empty token ("4+" -> {"4", ""}).
 * An empty input yields an empty vector.
 */
std::vector<std::string>
explode(char delim, std::string const & s)
{
	std::vector<std::string> result;
	std::istringstream iss(s);

	for (std::string token; std::getline(iss, token, delim); ) {
		result.push_back(std::move(token));
	}

	// std::getline drops the empty field after a trailing delimiter, so it
	// is added back by hand. The emptiness check keeps back() well defined.
	if (!s.empty() && s.back() == delim) {
		result.push_back(std::string());
	}

	return result;
}

/**
 * Replace all matches of @p pattern in @p subject with @p replacement
 * (std::regex_replace format syntax, e.g. "$1").
 *
 * If @p pattern is not a valid regular expression, the error is printed to
 * std::cout and @p subject is returned unchanged.
 */
std::string
preg_replace(std::string & pattern, const char *replacement, std::string & subject)
{
	try {
		return std::regex_replace(subject, std::regex(pattern), replacement);
	} catch (const std::regex_error &e) {
		std::cout << e.what() << " " << pattern << "\n";
	}

	return subject;
}

typedef std::string (*my_callback_t)(const std::smatch &m);

/**
 * Replace all matches of @p pattern in @p subject with the value returned by
 * @p callback for each match.
 *
 * Regular expression errors are handled like in preg_replace(): they are
 * printed to std::cout and @p subject is returned unchanged.
 */
std::string
preg_replace_callback(std::string & pattern, my_callback_t callback, std::string & subject)
{
	try {
		return std::regex_replace2(subject, std::regex(pattern), callback);
	} catch (const std::regex_error &e) {
		std::cout << e.what() << " " << pattern << "\n";
	}

	return subject;
}

/**
 * Build the XPath predicate for the CSS "[attr$=value]" (ends-with) selector.
 *
 * matches[1] is the attribute name, matches[2] the value, which normally
 * arrives already wrapped in quotes by the earlier quoting rule.
 * For [class$="mple"] the result is
 *   [substring(@class,string-length(@class)-3)="mple"]
 */
std::string
dollar_equal_rule_callback(const std::smatch &matches)
{
	const std::string attr = matches[1].str();
	std::string value = matches[2].str();

	// Drop the surrounding quote characters so that the length of the bare
	// value can be computed.
	const std::string::size_type first = value.find_first_not_of("\"'");

	if (first == std::string::npos) {
		value.clear();
	} else {
		const std::string::size_type last = value.find_last_not_of("\"'");
		value = value.substr(first, last - first + 1);
	}

	// substring() is 1-based: the last N characters start at length-(N-1).
	// Signed arithmetic avoids wrap-around for an empty value.
	const long offset = static_cast<long>(value.size()) - 1;

	std::ostringstream os;
	os << "[substring(@" << attr << ",string-length(@" << attr << ")";

	if (offset >= 0) {
		os << "-" << offset;
	} else {
		os << "+" << -offset;
	}
	// Compare against the requested value, not against the attribute name.
	os << ")=\"" << value << "\"]";

	return os.str();
}

/** Apply the "[attr$=value]" rewrite to every occurrence in @p selector. */
std::string
dollar_equal_rule_apply(std::string &selector)
{
	std::string pattern("\\[([a-zA-Z0-9_\\-]+)\\$=([^\\]]+)\\]");

	return preg_replace_callback(pattern, dollar_equal_rule_callback, selector);
}

class Translator;

/** Base class of all rewrite rules; the default rule changes nothing. */
class Rule
{
public:
	Rule()
	{
	}

	// Rules are owned and destroyed through pointers to Rule.
	virtual ~Rule()
	{
	}

	virtual std::string apply(std::string &selector)
	{
		return selector;
	}
};

/** Rule that performs one plain regular expression substitution. */
class RegexRule : public Rule
{
private:
	// Both point to string literals supplied by the Translator.
	const char *pattern, *replacement;

public:
	RegexRule(const char *pat, const char *repl) : pattern(pat), replacement(repl)
	{
	}

	std::string apply(std::string &selector) override
	{
		std::string r(pattern);

		return preg_replace(r, replacement, selector);
	}
};

/** Rule for "tag:not(selector)"; needs the Translator to translate the inner selector. */
class NotRule : public Rule
{
private:
	Translator *t;

public:
	NotRule(Translator *tt);
	std::string apply(std::string &selector) override;
	std::string callback(const std::smatch &matches);
};

std::string not_rule_callback(const std::smatch &m);

// Rule instance whose apply() is currently running; read by not_rule_callback()
// because preg_replace_callback() only accepts a plain function pointer.
NotRule *currentNotRule = nullptr;

/** True if @p str is non-empty and consists of decimal digits only. */
static bool
is_digits(const std::string &str)
{
	// std::isdigit must not be given a negative value, hence unsigned char.
	return !str.empty()
		&& std::all_of(str.begin(), str.end(), [](unsigned char c) { return std::isdigit(c) != 0; });
}

class NthChildRule;

// Same role as currentNotRule, for nth_callback().
NthChildRule *currentNth = nullptr;

std::string nth_callback(const std::smatch &m);

/** Rule for "tag:nth-child(arg)". */
class NthChildRule : public Rule
{
public:
	NthChildRule()
	{
	}

	std::string apply(std::string & selector) override
	{
		currentNth = this;
		std::string pat("([a-zA-Z0-9_\\-*]+):nth-child\\(([^)]*)\\)");

		return preg_replace_callback(pat, nth_callback, selector);
	}

	/**
	 * Translate one "tag:nth-child(arg)" occurrence.
	 *
	 * matches[1] is the tag, matches[2] the argument. Supported arguments
	 * are "n", "even", "odd", a plain number and the "An+B" form. Anything
	 * else leaves the matched text untouched.
	 */
	std::string callback(const std::smatch &matches)
	{
		const std::string tag = matches[1].str();
		const std::string arg = matches[2].str();
		std::ostringstream res;

		if (arg == "n") {
			res << tag;
		} else if (arg == "even") {
			res << tag << "[(count(preceding-sibling::*) + 1) mod 2=0]";
		} else if (arg == "odd") {
			res << tag << "[(count(preceding-sibling::*) + 1) mod 2=1]";
		} else if (is_digits(arg)) {
			res << "*[" << arg << "]/self::" << tag;
		} else {
			// Leading digits are the step, trailing digits the offset.
			// The lazy ".*?" skips whatever the earlier combinator
			// rules turned the "+" into.
			static const std::regex an_plus_b("^([\\d]*)n.*?([\\d]*)$");
			std::smatch parts;

			if (!std::regex_match(arg, parts, an_plus_b)) {
				// Not a form handled here: do not guess.
				return matches[0].str();
			}

			std::string step = parts[1].str();

			if (step.empty()) {
				// "n+B" means "1n+B".
				step = "1";
			}
			// An absent offset ("4n") is parsed as 0.
			const long offset = std::strtol(parts[2].str().c_str(), nullptr, 10);

			res << tag << "[(count(preceding-sibling::*)+1)>=" << offset
			    << " and ((count(preceding-sibling::*)+1)-" << offset
			    << ") mod " << step << "=0]";
		}

		return res.str();
	}
};

/** Function-pointer trampoline to NthChildRule::callback of the active rule. */
std::string
nth_callback(const std::smatch &m)
{
	return ::currentNth->callback(m);
}

/** Function-pointer trampoline to NotRule::callback of the active rule. */
std::string
not_rule_callback(const std::smatch &m)
{
	return ::currentNotRule->callback(m);
}

/** Rule for the "[attr$=value]" selector. */
class DollarEqualRule : public Rule
{
public:
	DollarEqualRule()
	{
	}

	std::string apply(std::string &selector) override
	{
		return dollar_equal_rule_apply(selector);
	}
};

/**
 * Translates a CSS selector into an XPath expression by applying a fixed,
 * ordered list of rewrite rules.
 */
class Translator
{
public:
	/**
	 * Translate @p selector.
	 *
	 * @param selector CSS selector; it is rewritten in place while the
	 *                 rules are applied.
	 * @return "/" if the rules reduced the selector to "/", otherwise the
	 *         rewritten selector prefixed with "//".
	 */
	std::string translate(std::string & selector)
	{
		for (const auto &r : rules) {
			selector = r->apply(selector);
		}
		// Values that were already quoted got a second pair of quotes
		// from the quoting rule; collapse them again.
		std::string quotes("\"\"");
		selector = preg_replace(quotes, "\"", selector);

		return selector == "/" ? "/" : ("//" + selector);
	}

	// The order of the rules matters: later rules rely on the output of
	// earlier ones. '_' is written unescaped in the patterns because it is
	// not a regular expression metacharacter.
	Translator()
	{
		rules.reserve(34);

		auto add = [this](Rule *rule) {
			rules.push_back(std::unique_ptr<Rule>(rule));
		};
		auto re = [&add](const char *pat, const char *repl) {
			add(new RegexRule(pat, repl));
		};

		// prefix|name
		re("([a-zA-Z0-9_\\-\\*]+)\\|([a-zA-Z0-9_\\-\\*]+)", "$1:$2");

		// add @ for attribs
		re("\\[([^G\\]~\\$\\*\\^\\|\\!]+)(=[^\\]]+)?\\]", "[@$1$2]");

		// multiple queries
		re("\\s*,\\s*", "|");

		// , + ~ >
		re("\\s*([\\+~>])\\s*", "$1");

		//* ~ + >
		re("([a-zA-Z0-9_\\-\\*])~([a-zA-Z0-9_\\-\\*])", "$1/following-sibling::$2");
		re("([a-zA-Z0-9_\\-\\*])\\+([a-zA-Z0-9_\\-\\*])", "$1/following-sibling::*[1]/self::$2");
		re("([a-zA-Z0-9_\\-\\*])>([a-zA-Z0-9_\\-\\*])", "$1/$2");

		// all unescaped stuff escaped
		re("\\[([^=]+)=([^'|][^\\]]*)\\]", "[$1=\"$2\"]");

		// all descendant or self to //
		re("(^|[^a-zA-Z0-9_\\-\\*])([#\\.])([a-zA-Z0-9_\\-]+)", "$1*$2$3");
		re("([\\>\\+\\|\\~\\,\\s])([a-zA-Z\\*]+)", "$1//$2");
		re("\\s+\\/\\/", "//");

		// :first-child
		re("([a-zA-Z0-9_\\-\\*]+):first-child", "*[1]/self::$1");

		// :last-child
		re("([a-zA-Z0-9_\\-\\*]+)?:last-child", "$1[not(following-sibling::*)]");

		// :only-child
		re("([a-zA-Z0-9_\\-\\*]+):only-child", "*[last()=1]/self::$1");

		// :empty
		re("([a-zA-Z0-9_\\-\\*]+)?:empty", "$1[not(*) and not(normalize-space())]");

		// :not
		add(new NotRule(this));

		// :nth-child
		add(new NthChildRule());

		// :contains(selectors)
		re(":contains\\(([^\\)]*)\\)", "[contains(string(.),\"$1\")]");

		// |= attrib
		re("\\[([a-zA-Z0-9_\\-]+)\\|=([^\\]]+)\\]", "[@$1=$2 or starts-with(@$1,concat($2,\"-\"))]");

		// *= attrib
		re("\\[([a-zA-Z0-9_\\-]+)\\*=([^\\]]+)\\]", "[contains(@$1,$2)]");

		// ~= attrib
		re("\\[([a-zA-Z0-9_\\-]+)~=([^\\]]+)\\]", "[contains(concat(\" \",normalize-space(@$1),\" \"),concat(\" \",$2,\" \"))]");

		// ^= attrib
		re("\\[([a-zA-Z0-9_\\-]+)\\^=([^\\]]+)\\]", "[starts-with(@$1,$2)]");

		// $= attrib
		add(new DollarEqualRule());

		// != attrib
		re("\\[([a-zA-Z0-9_\\-]+)\\!=[\\\"']+?([^\\\"\\]]+)[\\\"']+?\\]", "[not(@$1) or @$1!=\"$2\"]");

		// ids
		re("#([a-zA-Z0-9_\\-]+)", "[@id=\"$1\"]");

		// classes
		re("\\.([a-zA-Z0-9_-]+)(?![^[]*\\])", "[contains(concat(\" \",normalize-space(@class),\" \"),\" $1 \")]");

		// normalize multiple filters
		re("\\]\\[([^\\]]+)", " and ($1)");

		// tag:pseudo selectors
		re("(:enabled)", "[not(@disabled)]");
		re("(:checked)", "[@checked=\"checked\"]");
		re(":(disabled)", "[@$1]");
		re(":root", "/");
		re(":scope", "");

		// use * when tag was omitted
		re("^\\[", "*[");
		re("\\|\\[", "|*[");
	}

	// The NotRule keeps a pointer back to this object, so a Translator
	// must stay where it was constructed.
	Translator(const Translator &) = delete;
	Translator &operator=(const Translator &) = delete;

private:
	std::vector<std::unique_ptr<Rule>> rules;
};

NotRule::NotRule(Translator *tt) : t(tt)
{
}

std::string
NotRule::apply(std::string &selector)
{
	currentNotRule = this;
	std::string pat("([a-zA-Z0-9_\\-\\*]+):not\\(([^\\)]*)\\)");

	return preg_replace_callback(pat, not_rule_callback, selector);
}

/**
 * Translate one "tag:not(inner)" occurrence: the inner selector is
 * translated on its own and the content of its first predicate is wrapped
 * in not(). matches[1] is the tag, matches[2] the inner selector.
 */
std::string
NotRule::callback(const std::smatch &matches)
{
	std::string m(matches[2].str());
	// Captures the text between the first "[" and the next "]".
	std::string pat("^[^\\[]+\\[([^\\]]*)\\].*$");
	std::string ret(t->translate(m));
	std::string subresult = preg_replace(pat, "$1", ret);

	return matches[1].str() + "[not(" + subresult + ")]";
}

/**
 * Translate the CSS selector @p selector into an XPath expression.
 *
 * @param selector CSS selector; note that it is modified while it is being
 *                 translated (see Translator::translate()).
 * @return the XPath expression.
 */
std::string
css2xpath(std::string &selector)
{
	// Built on first use; a function-local static is initialised once.
	static Translator translator;

	return translator.translate(selector);
}

#if 0

std::string
next_year(const std::smatch& matches)
{
	std::ostringstream os;

	os << matches[1] << std::atoi(matches[2].str().c_str()) + 1;

	return os.str();
}

typedef const char *test_t[2];

void
tests()
{
	test_t provider[] = {
		{"div", "//div"},
		{"body div", "//body//div"},
		{"div p", "//div//p"},
		{"div > p", "//div/p"},
		{"div + p", "//div/following-sibling::*[1]/self::p"},
		{"div ~ p", "//div/following-sibling::p"},
		{"div p a", "//div//p//a"},
		{"div, p, a", "//div|//p|//a"},
		{".note", "//*[contains(concat(\" \",normalize-space(@class),\" \"),\" note \")]"},
		{"div.example", "//div[contains(concat(\" \",normalize-space(@class),\" \"),\" example \")]"},
		{"ul .tocline2", "//ul//*[contains(concat(\" \",normalize-space(@class),\" \"),\" tocline2 \")]"},
		{"div.example, div.note", "//div[contains(concat(\" \",normalize-space(@class),\" \"),\" example \")]|//div[contains(concat(\" \",normalize-space(@class),\" \"),\" note \")]"},
		{"#title", "//*[@id=\"title\"]"},
		{"h1#title", "//h1[@id=\"title\"]"},
		{"div #title", "//div//*[@id=\"title\"]"},
		{"ul.toc li.tocline2", "//ul[contains(concat(\" \",normalize-space(@class),\" \"),\" toc \")]//li[contains(concat(\" \",normalize-space(@class),\" \"),\" tocline2 \")]"},
		{"ul.toc > li.tocline2", "//ul[contains(concat(\" \",normalize-space(@class),\" \"),\" toc \")]/li[contains(concat(\" \",normalize-space(@class),\" \"),\" tocline2 \")]"},
		{"h1#title + div > p", "//h1[@id=\"title\"]/following-sibling::*[1]/self::div/p"},
		{"h1[id]:contains(Selectors)", "//h1[@id and (contains(string(.),\"Selectors\"))]"},
		{"a[href][lang][class]", "//a[@href and (@lang) and (@class)]"},
		{"div[class]", "//div[@class]"},
		{"div[class=example]", "//div[@class=\"example\"]"},
		{"div[class^=exa]", "//div[starts-with(@class,\"exa\")]"},
		{"div[class$=mple]", "//div[substring(@class,string-length(@class)-3)=\"mple\"]"},
		{"div[class*=e]", "//div[contains(@class,\"e\")]"},
		{"div[class|=dialog]", "//div[@class=\"dialog\" or starts-with(@class,concat(\"dialog\",\"-\"))]"},
		{"div[class!=made_up]", "//div[not(@class) or @class!=\"made_up\"]"},
		{"div[property!=\"made_up\"]", "//div[not(@property) or @property!=\"made_up\"]"},
		{"div[class~=example]", "//div[contains(concat(\" \",normalize-space(@class),\" \"),concat(\" \",\"example\",\" \"))]"},
		{"div:not(.example)", "//div[not(contains(concat(\" \",normalize-space(@class),\" \"),\" example \"))]"},
		{"p:contains(selectors)", "//p[contains(string(.),\"selectors\")]"},
		{"p:nth-child(n)", "//p"},
		{"p:nth-child(even)", "//p[(count(preceding-sibling::*) + 1) mod 2=0]"},
		{"p:nth-child(odd)", "//p[(count(preceding-sibling::*) + 1) mod 2=1]"},
		{"p:nth-child(3n+8)", "//p[(count(preceding-sibling::*)+1)>=8 and ((count(preceding-sibling::*)+1)-8) mod 3=0]"},
		{"p:nth-child(2n+1)", "//p[(count(preceding-sibling::*)+1)>=1 and ((count(preceding-sibling::*)+1)-1) mod 2=0]"},
		{"p:nth-child(3)", "//*[3]/self::p"},
		{"p:nth-child(4n)", "//p[(count(preceding-sibling::*)+1)>=0 and ((count(preceding-sibling::*)+1)-0) mod 4=0]"},
		{"p:only-child", "//*[last()=1]/self::p"},
		{"p:last-child", "//p[not(following-sibling::*)]"},
		{"p:first-child", "//*[1]/self::p"},
		{"foo|bar", "//foo:bar"},
		{"div[class^=exa][class$=mple]", "//div[starts-with(@class,\"exa\") and (substring(@class,string-length(@class)-3)=\"mple\")]"},
		{"input:enabled", "//input[not(@disabled)]"},
		{"input:checked", "//input[@checked=\"checked\"]"},
		{"input:disabled", "//input[@disabled]"},
		{":empty", "//*[not(*) and not(normalize-space())]"},
		{":root", "/"},
		{"meta[name=\"gaf\"]", "//meta[@name=\"gaf\"]"}
	};

	for (const auto &t : provider) {
		std::string selector(t[0]);
		std::string expected(t[1]);
		std::string result = css2xpath(selector);
		std::cout << t[0] << " ";
		std::cout << ((result == expected) ? "\033[32mOK\033[0m" : "\033[31mFAIL\033[0m");
		std::cout << "\n";
	}
}

int
main(int argc, char **argv)
{
#if 0
#if 1
	std::vector<std::string> x;

	for (int i = 2; i < argc; i++) {
		x.push_back(argv[i]);
	}
	std::string res = implode(argv[1], x);
	std::cout << res << "\n";
#endif

#if 1
	auto v = explode(' ', "hello world foo bar");

	for (auto e : v) {
		std::cout << e << "\n";
	}
#endif

#if 1
	std::string str = "April 15, 2003";
	std::string pattern = "(\\w+) (\\d+), (\\d+)";
	const char *replacement = "$1,1,$3";
	std::cout << preg_replace(pattern, replacement, str);
#endif

	std::string text = "April fools day is 04/01/2002\n";
	text += "Last christmas was 12/24/2001\n";
	std::string pat2 = "(\\d{2}/\\d{2}/)(\\d{4})";
	std::cout << preg_replace_callback(pat2, next_year, text);
#endif

#if 0
	auto v = explode('+', "4+");

	for (auto e : v) {
		std::cout << e << "\n";
	}
#endif
	tests();

	return 0;
}
#endif