url.cpp
· 17 KiB · C++
Eredeti
#include "common/url.h"
#include <fmt/format.h>
#include <cctype>
using namespace ProtoRock::Http;
// Identifies which URL component a string belongs to. shouldEscape(),
// escape() and unescape() use it to pick the set of bytes that must be
// %-encoded (or rejected) for that component.
enum EncodingMode {
    encodePath = 1,            // a whole path
    encodePathSegment = 2,     // a single path segment
    encodeHost = 3,            // host, including an optional :port
    encodeZone = 4,            // IPv6 zone identifier
    encodeUserPassword = 5,    // user name or password
    encodeQueryComponent = 6,  // query component (' ' is written as '+')
    encodeFragment = 7,        // fragment
};
// Converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value 0-15. Any other character yields 0, so callers that care
// must validate the digit themselves first.
char unhex(char c) {
    const bool isDecimal = c >= '0' && c <= '9';
    const bool isLowerHex = c >= 'a' && c <= 'f';
    const bool isUpperHex = c >= 'A' && c <= 'F';

    if (isDecimal) {
        return c - '0';
    } else if (isLowerHex) {
        return c - 'a' + 10;
    } else if (isUpperHex) {
        return c - 'A' + 10;
    }
    return 0;
}
// Uppercase hexadecimal digits indexed by nibble value (0-15); escape()
// uses this table to write the two digits of each %XX sequence.
const char *upperhex = "0123456789ABCDEF";
// Reports whether byte c must be %-escaped when it appears in the URL
// component selected by mode, following RFC 3986.
//
// Note that shouldEscape does not yet check all reserved characters
// correctly. See golang.org/issue/5684.
bool shouldEscape(char c, EncodingMode mode) {
    // §2.3 Unreserved characters (alphanum) are never escaped.
    const bool lower = c >= 'a' && c <= 'z';
    const bool upper = c >= 'A' && c <= 'Z';
    const bool digit = c >= '0' && c <= '9';
    if (lower || upper || digit) {
        return false;
    }

    if (mode == encodeHost || mode == encodeZone) {
        // §3.2.2 Host allows
        //	sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
        // as part of reg-name.
        // ':' is accepted because :port is kept as part of the host,
        // '[' and ']' because [ipv6]:port is kept as part of the host, and
        // '<', '>' and '"' because hosts cannot use %-encoding for ASCII
        // bytes, so escaping them would only make Parse reject the host.
        const bool subDelim = c == '!' || c == '$' || c == '&' || c == '\'' || c == '(' || c == ')' ||
                              c == '*' || c == '+' || c == ',' || c == ';' || c == '=';
        const bool hostExtra = c == ':' || c == '[' || c == ']' || c == '<' || c == '>' || c == '"';
        if (subDelim || hostExtra) {
            return false;
        }
    }

    // §2.3 Unreserved characters (mark)
    if (c == '-' || c == '_' || c == '.' || c == '~') {
        return false;
    }

    // §2.2 Reserved characters (reserved). Different sections of the URL
    // allow a few of them to appear unescaped.
    const bool reserved = c == '$' || c == '&' || c == '+' || c == ',' || c == '/' || c == ':' ||
                          c == ';' || c == '=' || c == '?' || c == '@';
    if (reserved) {
        switch (mode) {
            case encodePath:  // §3.3
                // The RFC allows : @ & = + $ but saves / ; , for assigning
                // meaning to individual path segments. The path is handled
                // as a whole here, so those three are allowed as well and
                // only '?' is left to escape.
                return c == '?';

            case encodePathSegment:  // §3.3
                // Same as above, but / ; , keep their per-segment meaning.
                return c == '/' || c == ';' || c == ',' || c == '?';

            case encodeUserPassword:  // §3.2.1
                // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
                // userinfo, so only '@', '/', and '?' must be escaped.
                // Userinfo parsing treats ':' as a separator, so it is
                // escaped too.
                return c == '@' || c == '/' || c == '?' || c == ':';

            case encodeQueryComponent:  // §3.4
                // The RFC reserves (so we must escape) everything.
                return true;

            case encodeFragment:  // §4.1
                // The RFC text is silent but the grammar allows
                // everything, so escape nothing.
                return false;

            default:
                // Host/zone modes: reserved bytes not accepted above are
                // handled by the final return.
                break;
        }
    }

    // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims
    // is included in reserved from RFC 2396 §2.2; the remaining ones do not
    // need escaping. To minimise breakage two restrictions apply:
    // (1) sub-delims are always escaped outside of the fragment, and
    // (2) the single quote is always escaped, for callers that assumed it
    // would be. See issue #19917.
    if (mode == encodeFragment && (c == '!' || c == '(' || c == ')' || c == '*')) {
        return false;
    }

    // Everything else must be escaped.
    return true;
}
// Returns s with every byte that shouldEscape() flags for `mode` replaced by
// its %XX form (uppercase hex). In encodeQueryComponent mode a space is
// written as '+' instead of "%20". If nothing needs escaping, s is returned
// unchanged.
std::string escape(const std::string &s, EncodingMode mode) {
    // First pass: count what will be rewritten so the no-op case can return
    // early and the output can be sized once.
    std::size_t spaceCount = 0;
    std::size_t hexCount = 0;
    for (const char c : s) {
        if (!shouldEscape(c, mode)) {
            continue;
        }
        if (c == ' ' && mode == encodeQueryComponent) {
            ++spaceCount;
        } else {
            ++hexCount;
        }
    }

    if (spaceCount == 0 && hexCount == 0) {
        return s;
    }

    // Each %-escaped byte grows from one character to three.
    std::string out;
    out.reserve(s.size() + 2 * hexCount);

    for (const char c : s) {
        if (c == ' ' && mode == encodeQueryComponent) {
            out.push_back('+');
        } else if (shouldEscape(c, mode)) {
            // Index the hex table with the unsigned byte value: with a
            // signed char, bytes >= 0x80 would produce a negative index.
            const auto byte = static_cast<unsigned char>(c);
            out.push_back('%');
            out.push_back(upperhex[byte >> 4]);
            out.push_back(upperhex[byte & 15]);
        } else {
            out.push_back(c);
        }
    }

    return out;
}
// Reverses escape(): every %XX sequence in s is replaced by the byte it
// encodes and, in encodeQueryComponent mode only, '+' becomes a space.
//
// Throws std::invalid_argument when
//  - a '%' is not followed by two hexadecimal digits,
//  - in encodeHost mode, a %-escape encodes an ASCII byte other than "%25",
//  - in encodeZone mode, a %-escape encodes a byte that would not be allowed
//    unescaped in a host (except "%25" and space),
//  - in encodeHost/encodeZone mode, s contains an ASCII byte that
//    shouldEscape() rejects for that mode.
std::string unescape(std::string s, EncodingMode mode) {
    // First pass: count %, check that they're well-formed.
    std::size_t n = 0;
    bool hasPlus = false;
    for (std::size_t i = 0; i < s.size();) {
        switch (s[i]) {
            case '%': {
                ++n;
                // isxdigit requires a value representable as unsigned char.
                if (i + 2 >= s.size() || !std::isxdigit(static_cast<unsigned char>(s[i + 1])) ||
                    !std::isxdigit(static_cast<unsigned char>(s[i + 2]))) {
                    // Report the offending sequence: at most three bytes
                    // starting at the '%' itself.
                    throw std::invalid_argument("escape error: " + s.substr(i, 3));
                }
                const std::string triplet = s.substr(i, 3);
                // Per https://tools.ietf.org/html/rfc3986#page-21
                // in the host component %-encoding can only be used
                // for non-ASCII bytes.
                // But https://tools.ietf.org/html/rfc6874#section-2
                // introduces %25 being allowed to escape a percent sign
                // in IPv6 scoped-address literals.
                if (mode == encodeHost && unhex(s[i + 1]) < 8 && triplet != "%25") {
                    throw std::invalid_argument("escape error: " + triplet);
                }

                if (mode == encodeZone) {
                    // RFC 6874 says basically "anything goes" for zone identifiers
                    // and that even non-ASCII can be redundantly escaped,
                    // but it seems prudent to restrict %-escaped bytes here to those
                    // that are valid host name bytes in their unescaped form.
                    // That is, you can use escaping in the zone identifier but not
                    // to introduce bytes you couldn't just write directly.
                    // Windows puts spaces here, so space is tolerated too.
                    const int v = unhex(s[i + 1]) << 4 | unhex(s[i + 2]);
                    if (triplet != "%25" && v != ' ' && shouldEscape(static_cast<char>(v), encodeHost)) {
                        throw std::invalid_argument("escape error: " + triplet);
                    }
                }
                i += 3;
                break;
            }
            case '+':
                hasPlus = mode == encodeQueryComponent;
                ++i;
                break;
            default:
                if ((mode == encodeHost || mode == encodeZone) && static_cast<unsigned char>(s[i]) < 0x80 &&
                    shouldEscape(s[i], mode)) {
                    throw std::invalid_argument("invalid host: " + s.substr(i, 1));
                }
                ++i;
        }
    }

    // Nothing to rewrite.
    if (n == 0 && !hasPlus) {
        return s;
    }

    // Second pass: the first pass proved every '%' is followed by two hex
    // digits, so s[i + 1] and s[i + 2] are always in range here.
    std::string out;
    out.reserve(s.size() - 2 * n);
    for (std::size_t i = 0; i < s.size(); ++i) {
        switch (s[i]) {
            case '%':
                out.push_back(static_cast<char>(unhex(s[i + 1]) << 4 | unhex(s[i + 2])));
                i += 2;
                break;
            case '+':
                out.push_back(mode == encodeQueryComponent ? ' ' : '+');
                break;
            default:
                out.push_back(s[i]);
        }
    }
    return out;
}
// Reports whether s contains an ASCII control byte (0x00-0x1F or 0x7F).
bool stringContainsCTLByte(const std::string &s) {
    for (const char c : s) {
        // Compare as an unsigned byte: where char is signed, bytes >= 0x80
        // (e.g. UTF-8 sequences) are negative and would otherwise compare
        // below ' ' and be misreported as control bytes.
        const auto b = static_cast<unsigned char>(c);
        if (b < ' ' || b == 0x7f) {
            return true;
        }
    }
    return false;
}
// Splits a leading "scheme:" off url.
//
// A scheme is a letter followed by letters, digits, '+', '-' or '.', ended
// by ':'. When one is found it is returned and url is reduced to the text
// after the ':'. When url has no valid scheme, an empty string is returned
// and url is left untouched.
//
// Throws std::invalid_argument if url starts with ':' (a scheme separator
// with no scheme in front of it).
std::string getScheme(std::string &url) {
    for (std::size_t i = 0; i < url.size(); ++i) {
        const char c = url[i];
        if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) {
            // Letters are valid anywhere in a scheme.
            continue;
        }
        if (('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.') {
            // These may not start a scheme.
            if (i == 0) {
                return {};
            }
            continue;
        }
        if (c == ':') {
            if (i == 0) {
                throw std::invalid_argument("missing protocol scheme");
            }
            std::string scheme = url.substr(0, i);
            url.erase(0, i + 1);
            return scheme;
        }
        // we have encountered an invalid character,
        // so there is no valid scheme
        return {};
    }
    return {};
}
// validUserinfo reports whether s consists only of bytes permitted in a
// userinfo string by RFC 3986 Section 3.2.1:
//    userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
//    unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
//    sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
//                  / "*" / "+" / "," / ";" / "="
//
// '%' and '@' are accepted as well. pct-encoded sequences are not validated
// here; the caller does that through unescape.
bool validUserinfo(const std::string &s) {
    for (const auto ch : s) {
        const bool alnum = ('A' <= ch && ch <= 'Z') || ('a' <= ch && ch <= 'z') || ('0' <= ch && ch <= '9');
        if (alnum) {
            continue;
        }

        const bool unreservedMark = ch == '-' || ch == '.' || ch == '_' || ch == '~';
        const bool subDelim = ch == '!' || ch == '$' || ch == '&' || ch == '\'' || ch == '(' || ch == ')' ||
                              ch == '*' || ch == '+' || ch == ',' || ch == ';' || ch == '=';
        const bool extra = ch == ':' || ch == '%' || ch == '@';
        if (!(unreservedMark || subDelim || extra)) {
            return false;
        }
    }
    return true;
}
// validOptionalPort reports whether port is empty or is a ':' followed by
// zero or more decimal digits, i.e. matches /^:\d*$/.
bool validOptionalPort(const std::string &port) {
    if (port.empty()) {
        return true;
    }
    if (port.front() != ':') {
        return false;
    }
    // Everything after the leading ':' must be a digit.
    for (std::string::size_type k = 1; k < port.size(); ++k) {
        const char digit = port[k];
        const bool isDigit = digit >= '0' && digit <= '9';
        if (!isDigit) {
            return false;
        }
    }
    return true;
}
// parseHost parses host as an authority without user information, that is,
// as host[:port], and returns it with %-escapes decoded.
//
// Throws std::invalid_argument when a bracketed IP literal has no closing
// ']', when the text after the host is not a valid optional port, or when
// unescape() rejects the host or zone.
std::string parseHost(const std::string &host) {
    if (!host.empty() && host[0] == '[') {
        // Parse an IP-Literal in RFC 3986 and RFC 6874.
        // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
        const auto close = host.rfind(']');
        if (close == std::string::npos) {
            throw std::invalid_argument("cannot find ']' in host");
        }
        const auto colonPort = host.substr(close + 1);
        if (!validOptionalPort(colonPort)) {
            throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
        }
        // RFC 6874 defines that %25 (%-encoded percent) introduces
        // the zone identifier, and the zone identifier can use basically
        // any %-encoding it likes. That's different from the host, which
        // can only %-encode non-ASCII bytes.
        // We do impose some restrictions on the zone, to avoid stupidity
        // like newlines.
        const auto zone = host.find("%25");
        // Only a "%25" inside the brackets starts a zone.
        if (zone != std::string::npos && zone < close) {
            const auto address = unescape(host.substr(0, zone), encodeHost);
            const auto zoneId = unescape(host.substr(zone, close - zone), encodeZone);
            const auto tail = unescape(host.substr(close), encodeHost);  // "]" plus optional :port
            return address + zoneId + tail;
        }
    } else {
        const auto colon = host.rfind(':');
        if (colon != std::string::npos) {
            const auto colonPort = host.substr(colon);
            if (!validOptionalPort(colonPort)) {
                throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
            }
        }
    }

    return unescape(host, encodeHost);
}
// Splits authority ("[userinfo@]host[:port]") at its last '@'.
//
// host receives the parsed host[:port] part. When userinfo is present it is
// validated, split at its first ':' into user name and password, %-decoded
// and stored in ui; when there is no '@', ui is left untouched.
//
// Throws std::invalid_argument if the userinfo contains a disallowed byte,
// or if parseHost()/unescape() reject their input.
void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) {
    const auto at = authority.rfind('@');
    if (at == std::string::npos) {
        host = parseHost(authority);
        return;
    }
    host = parseHost(authority.substr(at + 1));

    const auto userInfo = authority.substr(0, at);
    if (!validUserinfo(userInfo)) {
        throw std::invalid_argument("invalid userinfo");
    }

    const auto colon = userInfo.find(':');
    if (colon == std::string::npos) {
        // User name only.
        ui = UserInfo(unescape(userInfo, encodeUserPassword));
    } else {
        // The ':' separator belongs to neither the user name nor the password.
        ui.Username = unescape(userInfo.substr(0, colon), encodeUserPassword);
        ui.Password = unescape(userInfo.substr(colon + 1), encodeUserPassword);
    }
}
// Parses url into its components.
//
// The text is split into fragment (after the first '#'), scheme, query
// (after the first '?') and then either an opaque part (scheme present and
// the remainder does not start with '/') or an authority plus path.
// Fragment and RawQuery are stored without their leading '#' / '?'.
// ForceQuery is set when the URL ends in its only '?'.
//
// Throws std::invalid_argument if the URL contains control bytes or if any
// component fails validation or %-decoding.
URL URL::Parse(std::string url) {
    URL u;

    // Split off the fragment; the '#' itself is not part of it.
    std::string frag;
    const auto hashIndex = url.find('#');
    if (hashIndex != std::string::npos) {
        frag = url.substr(hashIndex + 1);
        url.erase(hashIndex);
    }

    u.setFragment(frag);

    if (stringContainsCTLByte(url)) {
        throw std::invalid_argument("invalid url: string contains control bytes");
    }

    if (url == "*") {
        u.Path = "*";
        return u;
    }
    auto rest = url;

    u.Scheme = getScheme(rest);
    std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(),
                   [](unsigned char c) -> unsigned char { return std::tolower(c); });

    const auto queryIndex = rest.find('?');
    if (queryIndex != std::string::npos) {
        if (queryIndex + 1 == rest.size()) {
            // The only '?' is the last character: an empty but explicitly
            // requested query.
            u.ForceQuery = true;
            rest.pop_back();
        } else {
            // The '?' itself is not part of RawQuery.
            u.RawQuery = rest.substr(queryIndex + 1);
            rest.erase(queryIndex);
        }
    }

    const bool startsWithSlash = !rest.empty() && rest[0] == '/';
    if (!startsWithSlash && !u.Scheme.empty()) {
        // We consider rootless paths per RFC 3986 as opaque. This includes
        // an empty remainder ("scheme:"), which has no authority to parse.
        u.Opaque = rest;
        return u;
    }

    // An authority is only present when the remainder starts with "//".
    // Without a scheme, "///" is treated as a plain path instead.
    const bool hasDoubleSlash = rest.compare(0, 2, "//") == 0;
    const bool hasTripleSlash = rest.compare(0, 3, "///") == 0;
    if (hasDoubleSlash && (!u.Scheme.empty() || !hasTripleSlash)) {
        auto authority = rest.substr(2);
        rest.clear();
        const auto slash = authority.find('/');
        if (slash != std::string::npos) {
            rest = authority.substr(slash);
            authority.erase(slash);
        }
        parseAuthority(authority, u.User, u.Host);
    }

    u.setPath(rest);

    return u;
}
// Stores the decoded form of f in Fragment. RawFragment keeps f itself only
// when re-escaping the decoded fragment does not reproduce f; otherwise it
// is set to the empty string.
void URL::setFragment(const std::string &f) {
    Fragment = unescape(f, encodeFragment);

    const bool roundTrips = escape(Fragment, encodeFragment) == f;
    if (roundTrips) {
        RawFragment = "";
    } else {
        RawFragment = f;
    }
}
void URL::setPath(const std::string &p) {
Path = unescape(p, encodePath);
auto escp = escape(Path, encodePath);
RawPath = (escp == p) ? "" : p;
}
std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); }
std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); }
std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); }
std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); }
| 1 | #include "common/url.h" |
| 2 | |
| 3 | #include <fmt/format.h> |
| 4 | |
| 5 | #include <cctype> |
| 6 | |
| 7 | using namespace ProtoRock::Http; |
| 8 | |
| 9 | enum EncodingMode { |
| 10 | encodePath = 1, |
| 11 | encodePathSegment, |
| 12 | encodeHost, |
| 13 | encodeZone, |
| 14 | encodeUserPassword, |
| 15 | encodeQueryComponent, |
| 16 | encodeFragment, |
| 17 | }; |
| 18 | |
| 19 | char unhex(char c) { |
| 20 | if ('0' <= c && c <= '9') { |
| 21 | return c - '0'; |
| 22 | } |
| 23 | if ('a' <= c && c <= 'f') { |
| 24 | return c - 'a' + 10; |
| 25 | } |
| 26 | if ('A' <= c && c <= 'F') { |
| 27 | return c - 'A' + 10; |
| 28 | } |
| 29 | return 0; |
| 30 | } |
| 31 | |
| 32 | const char *upperhex = "0123456789ABCDEF"; |
| 33 | |
| 34 | // Return true if the specified character should be escaped when |
| 35 | // appearing in a URL string, according to RFC 3986. |
| 36 | // |
| 37 | // Please be informed that for now shouldEscape does not check all |
| 38 | // reserved characters correctly. See golang.org/issue/5684. |
| 39 | bool shouldEscape(char c, EncodingMode mode) { |
| 40 | // §2.3 Unreserved characters (alphanum) |
| 41 | if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { |
| 42 | return false; |
| 43 | } |
| 44 | if (mode == encodeHost || mode == encodeZone) { |
| 45 | // §3.2.2 Host allows |
| 46 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" |
| 47 | // as part of reg-name. |
| 48 | // We add : because we include :port as part of host. |
| 49 | // We add [ ] because we include [ipv6]:port as part of host. |
| 50 | // We add < > because they're the only characters left that |
| 51 | // we could possibly allow, and Parse will reject them if we |
| 52 | // escape them (because hosts can't use %-encoding for |
| 53 | // ASCII bytes). |
| 54 | switch (c) { |
| 55 | case '!': |
| 56 | case '$': |
| 57 | case '&': |
| 58 | case '\'': |
| 59 | case '(': |
| 60 | case ')': |
| 61 | case '*': |
| 62 | case '+': |
| 63 | case ',': |
| 64 | case ';': |
| 65 | case '=': |
| 66 | case ':': |
| 67 | case '[': |
| 68 | case ']': |
| 69 | case '<': |
| 70 | case '>': |
| 71 | case '"': |
| 72 | return false; |
| 73 | } |
| 74 | } |
| 75 | |
| 76 | switch (c) { |
| 77 | // §2.3 Unreserved characters (mark) |
| 78 | case '-': |
| 79 | case '_': |
| 80 | case '.': |
| 81 | case '~': |
| 82 | return false; |
| 83 | |
| 84 | // §2.2 Reserved characters (reserved) |
| 85 | case '$': |
| 86 | case '&': |
| 87 | case '+': |
| 88 | case ',': |
| 89 | case '/': |
| 90 | case ':': |
| 91 | case ';': |
| 92 | case '=': |
| 93 | case '?': |
| 94 | case '@': |
| 95 | // Different sections of the URL allow a few of |
| 96 | // the reserved characters to appear unescaped. |
| 97 | switch (mode) { |
| 98 | case encodePath: // §3.3 |
| 99 | // The RFC allows : @ & = + $ but saves / ; , for assigning |
| 100 | // meaning to individual path segments. This package |
| 101 | // only manipulates the path as a whole, so we allow those |
| 102 | // last three as well. That leaves only ? to escape. |
| 103 | return c == '?'; |
| 104 | |
| 105 | case encodePathSegment: // §3.3 |
| 106 | // The RFC allows : @ & = + $ but saves / ; , for assigning |
| 107 | // meaning to individual path segments. |
| 108 | return c == '/' || c == ';' || c == ',' || c == '?'; |
| 109 | |
| 110 | case encodeUserPassword: // §3.2.1 |
| 111 | // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in |
| 112 | // userinfo, so we must escape only '@', '/', and '?'. |
| 113 | // The parsing of userinfo treats ':' as special so we must escape |
| 114 | // that too. |
| 115 | return c == '@' || c == '/' || c == '?' || c == ':'; |
| 116 | |
| 117 | case encodeQueryComponent: // §3.4 |
| 118 | // The RFC reserves (so we must escape) everything. |
| 119 | return true; |
| 120 | |
| 121 | case encodeFragment: // §4.1 |
| 122 | // The RFC text is silent but the grammar allows |
| 123 | // everything: case so escape nothing. |
| 124 | return false; |
| 125 | } |
| 126 | } |
| 127 | |
| 128 | if (mode == encodeFragment) { |
| 129 | // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are |
| 130 | // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not |
| 131 | // need to be escaped. To minimize potential breakage, we apply two restrictions: |
| 132 | // (1) we always escape sub-delims outside of the fragment, and (2) we always |
| 133 | // escape single quote to avoid breaking callers that had previously assumed that |
| 134 | // single quotes would be escaped. See issue #19917. |
| 135 | switch (c) { |
| 136 | case '!': |
| 137 | case '(': |
| 138 | case ')': |
| 139 | case '*': |
| 140 | return false; |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | return true; |
| 145 | } |
| 146 | |
| 147 | std::string escape(const std::string &s, EncodingMode mode) { |
| 148 | auto spaceCount = 0; |
| 149 | auto hexCount = 0; |
| 150 | |
| 151 | for (auto i = 0; i < s.size(); i++) { |
| 152 | auto c = s[i]; |
| 153 | if (shouldEscape(c, mode)) { |
| 154 | if (c == ' ' && mode == encodeQueryComponent) { |
| 155 | spaceCount++; |
| 156 | } else { |
| 157 | hexCount++; |
| 158 | } |
| 159 | } |
| 160 | } |
| 161 | |
| 162 | if (spaceCount == 0 && hexCount == 0) { |
| 163 | return s; |
| 164 | } |
| 165 | |
| 166 | auto required = s.size() + 2 * hexCount; |
| 167 | auto t = std::vector<char>(); |
| 168 | t.reserve(required); |
| 169 | |
| 170 | if (hexCount == 0) { |
| 171 | t.insert(t.begin(), s.begin(), s.end()); |
| 172 | for (auto i = 0; i < s.size(); i++) { |
| 173 | if (s[i] == ' ') { |
| 174 | t[i] = '+'; |
| 175 | } |
| 176 | } |
| 177 | return std::string(t.begin(), t.end()); |
| 178 | } |
| 179 | |
| 180 | auto j = 0; |
| 181 | auto c = 0; |
| 182 | for (auto i = 0; i < s.size(); i++) { |
| 183 | auto c = s[i]; |
| 184 | if (c == ' ' && mode == encodeQueryComponent) { |
| 185 | t[j] = '+'; |
| 186 | j++; |
| 187 | } else if (shouldEscape(c, mode)) { |
| 188 | t[j] = '%'; |
| 189 | t[j + 1] = upperhex[c >> 4]; |
| 190 | t[j + 2] = upperhex[c & 15]; |
| 191 | j += 3; |
| 192 | } else { |
| 193 | t[j] = s[i]; |
| 194 | j++; |
| 195 | } |
| 196 | } |
| 197 | |
| 198 | return std::string(t.begin(), t.end()); |
| 199 | } |
| 200 | |
| 201 | std::string unescape(std::string s, EncodingMode mode) { |
| 202 | // Count %, check that they're well-formed. |
| 203 | auto n = 0; |
| 204 | auto hasPlus = false; |
| 205 | auto tmp = std::string(); |
| 206 | auto v = 0; |
| 207 | for (int i = 0; i < s.size();) { |
| 208 | switch (s[i]) { |
| 209 | case '%': |
| 210 | n++; |
| 211 | if (i + 2 >= s.size() || !std::isxdigit(s[i + 1]) || !std::isxdigit(s[i + 2])) { |
| 212 | s = std::string(s.begin() + 1, s.end()); |
| 213 | if (s.size() > 3) { |
| 214 | s = std::string(s.begin(), s.begin() + 3); |
| 215 | } |
| 216 | throw std::invalid_argument("escape error: " + s); |
| 217 | } |
| 218 | // Per https://tools.ietf.org/html/rfc3986#page-21 |
| 219 | // in the host component %-encoding can only be used |
| 220 | // for non-ASCII bytes. |
| 221 | // But https://tools.ietf.org/html/rfc6874#section-2 |
| 222 | // introduces %25 being allowed to escape a percent sign |
| 223 | // in IPv6 scoped-address literals. Yay. |
| 224 | tmp = std::string(s.begin() + i, s.begin() + i + 3); |
| 225 | if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") { |
| 226 | throw std::invalid_argument("escape error: " + tmp); |
| 227 | } |
| 228 | |
| 229 | if (mode == encodeZone) { |
| 230 | // RFC 6874 says basically "anything goes" for zone identifiers |
| 231 | // and that even non-ASCII can be redundantly escaped, |
| 232 | // but it seems prudent to restrict %-escaped bytes here to those |
| 233 | // that are valid host name bytes in their unescaped form. |
| 234 | // That is, you can use escaping in the zone identifier but not |
| 235 | // to introduce bytes you couldn't just write directly. |
| 236 | // But Windows puts spaces here! Yay. |
| 237 | v = unhex(s[i + 1]) << 4 | unhex(s[i + 2]); |
| 238 | tmp = std::string(s.begin() + i, s.begin() + i + 3); |
| 239 | if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) { |
| 240 | throw std::invalid_argument("escape error: " + tmp); |
| 241 | } |
| 242 | } |
| 243 | i += 3; |
| 244 | break; |
| 245 | case '+': |
| 246 | hasPlus = mode == encodeQueryComponent; |
| 247 | i++; |
| 248 | break; |
| 249 | default: |
| 250 | if ((mode == encodeHost || mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) { |
| 251 | tmp = std::string(s.begin() + i, s.begin() + i + 1); |
| 252 | throw std::invalid_argument("invalid host: " + tmp); |
| 253 | } |
| 254 | i++; |
| 255 | } |
| 256 | } |
| 257 | |
| 258 | if (n == 0 && !hasPlus) { |
| 259 | return s; |
| 260 | } |
| 261 | |
| 262 | auto ss = std::stringstream(); |
| 263 | for (int i = 0; i < s.size(); i++) { |
| 264 | switch (s[i]) { |
| 265 | case '%': |
| 266 | ss << (char)(unhex(s[i + 1]) << 4 | unhex(s[i + 2])); |
| 267 | i += 2; |
| 268 | break; |
| 269 | case '+': |
| 270 | ss << ((mode == encodeQueryComponent) ? ' ' : '+'); |
| 271 | break; |
| 272 | default: |
| 273 | ss << s[i]; |
| 274 | } |
| 275 | } |
| 276 | return ss.str(); |
| 277 | } |
| 278 | |
| 279 | bool stringContainsCTLByte(const std::string &s) { |
| 280 | for (auto c : s) { |
| 281 | if (c < ' ' || c == 0x7f) { |
| 282 | return true; |
| 283 | } |
| 284 | } |
| 285 | return false; |
| 286 | } |
| 287 | |
| 288 | std::string getScheme(std::string &url) { |
| 289 | int i = 0; |
| 290 | std::string scheme; |
| 291 | for (auto &c : url) { |
| 292 | if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { |
| 293 | // Do nothing |
| 294 | } else if (('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.') { |
| 295 | if (i == 0) { |
| 296 | break; |
| 297 | } |
| 298 | } else if (c == ':') { |
| 299 | scheme = std::string(url.begin(), url.begin() + i); |
| 300 | url = std::string(url.begin() + i + 1, url.end()); |
| 301 | break; |
| 302 | } else { |
| 303 | // we have encountered an invalid character, |
| 304 | // so there is no valid scheme |
| 305 | break; |
| 306 | } |
| 307 | i++; |
| 308 | } |
| 309 | return scheme; |
| 310 | } |
| 311 | |
| 312 | // validUserinfo reports whether s is a valid userinfo string per RFC 3986 |
| 313 | // Section 3.2.1: |
| 314 | // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) |
| 315 | // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" |
| 316 | // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" |
| 317 | // / "*" / "+" / "," / ";" / "=" |
| 318 | // |
| 319 | // It doesn't validate pct-encoded. The caller does that via func unescape. |
| 320 | bool validUserinfo(const std::string &s) { |
| 321 | for (auto r : s) { |
| 322 | if ('A' <= r && r <= 'Z') { |
| 323 | continue; |
| 324 | } |
| 325 | if ('a' <= r && r <= 'z') { |
| 326 | continue; |
| 327 | } |
| 328 | if ('0' <= r && r <= '9') { |
| 329 | continue; |
| 330 | } |
| 331 | switch (r) { |
| 332 | case '-': |
| 333 | case '.': |
| 334 | case '_': |
| 335 | case ':': |
| 336 | case '~': |
| 337 | case '!': |
| 338 | case '$': |
| 339 | case '&': |
| 340 | case '\'': |
| 341 | case '(': |
| 342 | case ')': |
| 343 | case '*': |
| 344 | case '+': |
| 345 | case ',': |
| 346 | case ';': |
| 347 | case '=': |
| 348 | case '%': |
| 349 | case '@': |
| 350 | continue; |
| 351 | default: |
| 352 | return false; |
| 353 | } |
| 354 | } |
| 355 | return true; |
| 356 | } |
| 357 | |
| 358 | // validOptionalPort reports whether port is either an empty string |
| 359 | // or matches /^:\d*$/ |
| 360 | bool validOptionalPort(const std::string &port) { |
| 361 | if (port.empty()) { |
| 362 | return true; |
| 363 | } |
| 364 | if (port[0] != ':') { |
| 365 | return false; |
| 366 | } |
| 367 | for (auto b = port.begin() + 1; b < port.end(); b++) { |
| 368 | if (*b < '0' || *b > '9') { |
| 369 | return false; |
| 370 | } |
| 371 | } |
| 372 | return true; |
| 373 | } |
| 374 | |
| 375 | // parseHost parses host as an authority without user |
| 376 | // information. That is, as host[:port]. |
| 377 | std::string parseHost(const std::string &host) { |
| 378 | int idx; |
| 379 | if (!host.empty() && host[0] == '[') { |
| 380 | // Parse an IP-Literal in RFC 3986 and RFC 6874. |
| 381 | // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80". |
| 382 | idx = host.find_last_of(']'); |
| 383 | if (idx >= host.size()) { |
| 384 | throw std::invalid_argument("cannot find ']' in host"); |
| 385 | } |
| 386 | auto colonPort = std::string(host.begin() + idx + 1, host.end()); |
| 387 | if (!validOptionalPort(colonPort)) { |
| 388 | throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort)); |
| 389 | } |
| 390 | // RFC 6874 defines that %25 (%-encoded percent) introduces |
| 391 | // the zone identifier, and the zone identifier can use basically |
| 392 | // any %-encoding it likes. That's different from the host, which |
| 393 | // can only %-encode non-ASCII bytes. |
| 394 | // We do impose some restrictions on the zone, to avoid stupidity |
| 395 | // like newlines. |
| 396 | auto zone = host.find("%25"); |
| 397 | if (idx != std::string::npos) { |
| 398 | auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost); |
| 399 | auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost); |
| 400 | auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone); |
| 401 | return host1 + host2 + host3; |
| 402 | } |
| 403 | } else if ((idx = host.find_last_of(':')) < host.size()) { |
| 404 | auto colonPort = std::string(host.begin() + idx, host.end()); |
| 405 | if (!validOptionalPort(colonPort)) { |
| 406 | throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort)); |
| 407 | } |
| 408 | } |
| 409 | |
| 410 | return unescape(host, encodeHost); |
| 411 | } |
| 412 | |
| 413 | void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) { |
| 414 | auto i = authority.find_last_of('@'); |
| 415 | if (i > authority.size()) { |
| 416 | host = parseHost(authority); |
| 417 | } else { |
| 418 | host = parseHost(std::string(authority.begin() + i + 1, authority.end())); |
| 419 | } |
| 420 | |
| 421 | if (i > authority.size()) { |
| 422 | return; |
| 423 | } |
| 424 | |
| 425 | auto userInfo = std::string(authority.begin(), authority.begin() + i); |
| 426 | if (!validUserinfo(userInfo)) { |
| 427 | throw std::invalid_argument("invalid userinfo"); |
| 428 | } |
| 429 | auto idx = userInfo.find(':'); |
| 430 | if (idx == std::string::npos) { |
| 431 | userInfo = unescape(userInfo, encodeUserPassword); |
| 432 | ui = UserInfo(userInfo); |
| 433 | } else { |
| 434 | auto username = std::string(userInfo.begin(), userInfo.begin() + idx); |
| 435 | auto password = std::string(userInfo.begin() + idx, userInfo.end()); |
| 436 | ui.Username = unescape(username, encodeUserPassword); |
| 437 | ui.Password = unescape(username, encodeUserPassword); |
| 438 | } |
| 439 | } |
| 440 | |
| 441 | URL URL::Parse(std::string url) { |
| 442 | URL u; |
| 443 | std::string frag; |
| 444 | auto hashIndex = url.find("#"); |
| 445 | if (hashIndex != std::string::npos) { |
| 446 | frag = std::string(url.begin() + hashIndex, url.end()); |
| 447 | url = std::string(url.begin(), url.begin() + hashIndex); |
| 448 | } |
| 449 | |
| 450 | u.setFragment(frag); |
| 451 | |
| 452 | if (stringContainsCTLByte(url)) { |
| 453 | throw std::invalid_argument("invalid url: string contains control bytes"); |
| 454 | } |
| 455 | |
| 456 | if (url == "*") { |
| 457 | u.Path = "*"; |
| 458 | return u; |
| 459 | } |
| 460 | auto rest = url; |
| 461 | |
| 462 | u.Scheme = getScheme(rest); |
| 463 | std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); }); |
| 464 | |
| 465 | if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') { |
| 466 | u.ForceQuery = true; |
| 467 | rest.pop_back(); |
| 468 | } else { |
| 469 | auto idx = rest.find("?"); |
| 470 | if (idx != std::string::npos) { |
| 471 | u.RawQuery = std::string(rest.begin() + idx, rest.end()); |
| 472 | rest = std::string(rest.begin(), rest.begin() + idx); |
| 473 | } |
| 474 | } |
| 475 | |
| 476 | if (!rest.empty() && rest[0] != '/') { |
| 477 | if (!u.Scheme.empty()) { |
| 478 | // We consider rootless paths per RFC 3986 as opaque. |
| 479 | u.Opaque = rest; |
| 480 | return u; |
| 481 | } |
| 482 | } |
| 483 | |
| 484 | if (!u.Scheme.empty() || (rest.find("///") != 0 && rest.find("//") == 0)) { |
| 485 | auto authority = std::string(rest.begin() + 2, rest.end()); |
| 486 | rest = ""; |
| 487 | int i = authority.find("/"); |
| 488 | if (i != std::string::npos) { |
| 489 | rest = std::string(authority.begin() + i, authority.end()); |
| 490 | authority = std::string(authority.begin(), authority.begin() + i); |
| 491 | } |
| 492 | parseAuthority(authority, u.User, u.Host); |
| 493 | } |
| 494 | |
| 495 | u.setPath(rest); |
| 496 | |
| 497 | return u; |
| 498 | } |
| 499 | |
| 500 | void URL::setFragment(const std::string &f) { |
| 501 | Fragment = unescape(f, encodeFragment); |
| 502 | auto escf = escape(Fragment, encodeFragment); |
| 503 | RawFragment = (escf == f) ? "" : f; |
| 504 | } |
| 505 | |
| 506 | void URL::setPath(const std::string &p) { |
| 507 | Path = unescape(p, encodePath); |
| 508 | auto escp = escape(Path, encodePath); |
| 509 | RawPath = (escp == p) ? "" : p; |
| 510 | } |
| 511 | |
| 512 | std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); } |
| 513 | std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); } |
| 514 | std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); } |
| 515 | std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); } |
url.h
· 1.4 KiB · C
Eredeti
#pragma once
#include <algorithm>
#include <map>
#include <stdexcept>
#include "common/common.h"
// Based on Golang Implementation
// MIT
namespace ProtoRock {
namespace Http {
// User name and password taken from the userinfo part of a URL authority.
struct UserInfo {
    std::string Username;
    std::string Password;

    // Both fields start out empty.
    UserInfo() = default;

    // User name only; the password is left empty.
    UserInfo(const std::string &u) : Username(u) {}
};
// A parsed URL. Instances are produced by URL::Parse; the static escape
// helpers can be used without one.
struct URL {
    // Scheme, lower-cased by Parse.
    std::string Scheme;
    // Encoded opaque data (used when a scheme is followed by a rootless path).
    std::string Opaque;
    // User name and password information.
    UserInfo User;
    // host or host:port
    std::string Host;
    // Decoded path (relative paths may omit the leading slash).
    std::string Path;
    // Original encoded path; empty when it equals the default escaping of Path.
    std::string RawPath;
    // Append a query ('?') even if RawQuery is empty.
    bool ForceQuery = false;
    // Encoded query values, without '?'.
    std::string RawQuery;
    // Decoded fragment for references, without '#'.
    std::string Fragment;
    // Original encoded fragment; empty when it equals the default escaping
    // of Fragment.
    std::string RawFragment;

    // Parses url; throws std::invalid_argument on malformed input.
    static URL Parse(std::string url);

    // Escaping helpers for path and query strings.
    static std::string PathEscape(const std::string &path);
    static std::string PathUnescape(const std::string &path);
    static std::string QueryEscape(const std::string &query);
    static std::string QueryUnescape(const std::string &query);

   private:
    // Set the decoded field and its raw hint from an encoded string.
    void setFragment(const std::string &);
    void setPath(const std::string &);
};
} // namespace Http
} // namespace ProtoRock
| 1 | #pragma once |
| 2 | |
| 3 | #include <algorithm> |
| 4 | #include <map> |
| 5 | #include <stdexcept> |
| 6 | |
| 7 | #include "common/common.h" |
| 8 | |
| 9 | // Based on Golang Implementation |
| 10 | // MIT |
| 11 | |
| 12 | namespace ProtoRock { |
| 13 | namespace Http { |
| 14 | struct UserInfo { |
| 15 | std::string Username; |
| 16 | std::string Password; |
| 17 | |
| 18 | UserInfo() {} |
| 19 | UserInfo(const std::string &u) : Username(u){}; |
| 20 | }; |
| 21 | |
| 22 | struct URL { |
| 23 | private: |
| 24 | void setFragment(const std::string &); |
| 25 | void setPath(const std::string &); |
| 26 | |
| 27 | public: |
| 28 | std::string Scheme; |
| 29 | // encoded opaque data |
| 30 | std::string Opaque; |
| 31 | // username and password information |
| 32 | UserInfo User; |
| 33 | // host or host:port |
| 34 | std::string Host; |
| 35 | // path (relative paths may omit leading slash) |
| 36 | std::string Path; |
| 37 | // encoded path hint (see EscapedPath method) |
| 38 | std::string RawPath; |
| 39 | // append a query ('?') even if RawQuery is empty |
| 40 | bool ForceQuery = false; |
| 41 | // encoded query values, without '?' |
| 42 | std::string RawQuery; |
| 43 | // fragment for references, without '#' |
| 44 | std::string Fragment; |
| 45 | // encoded fragment hint (see EscapedFragment method) |
| 46 | std::string RawFragment; |
| 47 | |
| 48 | static URL Parse(std::string url); |
| 49 | static std::string PathEscape(const std::string &path); |
| 50 | static std::string PathUnescape(const std::string &path); |
| 51 | static std::string QueryEscape(const std::string &query); |
| 52 | static std::string QueryUnescape(const std::string &query); |
| 53 | }; |
| 54 | |
| 55 | } // namespace Http |
| 56 | } // namespace ProtoRock |