url.cpp - Opengist

Revisão bb021845b6cc9c6c27c8e19cb501380a7be049d9

url.cpp · 17 KiB · C++ Bruto

#include "common/url.h" #include <fmt/format.h> #include <cctype> using namespace ProtoRock::Http; enum EncodingMode { encodePath = 1, encodePathSegment, encodeHost, encodeZone, encodeUserPassword, encodeQueryComponent, encodeFragment, }; char unhex(char c) { if ('0' <= c && c <= '9') { return c - '0'; } if ('a' <= c && c <= 'f') { return c - 'a' + 10; } if ('A' <= c && c <= 'F') { return c - 'A' + 10; } return 0; } const char *upperhex = "0123456789ABCDEF"; // Return true if the specified character should be escaped when // appearing in a URL string, according to RFC 3986. // // Please be informed that for now shouldEscape does not check all // reserved characters correctly. See golang.org/issue/5684. bool shouldEscape(char c, EncodingMode mode) { // §2.3 Unreserved characters (alphanum) if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') { return false; } if (mode == encodeHost || mode == encodeZone) { // §3.2.2 Host allows // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" // as part of reg-name. // We add : because we include :port as part of host. // We add [ ] because we include [ipv6]:port as part of host. // We add < > because they're the only characters left that // we could possibly allow, and Parse will reject them if we // escape them (because hosts can't use %-encoding for // ASCII bytes). switch (c) { case '!': case '$': case '&': case '\'': case '(': case ')': case '*': case '+': case ',': case ';': case '=': case ':': case '[': case ']': case '<': case '>': case '"': return false; } } switch (c) { // §2.3 Unreserved characters (mark) case '-': case '_': case '.': case '~': return false; // §2.2 Reserved characters (reserved) case '$': case '&': case '+': case ',': case '/': case ':': case ';': case '=': case '?': case '@': // Different sections of the URL allow a few of // the reserved characters to appear unescaped. 
switch (mode) { case encodePath: // §3.3 // The RFC allows : @ & = + $ but saves / ; , for assigning // meaning to individual path segments. This package // only manipulates the path as a whole, so we allow those // last three as well. That leaves only ? to escape. return c == '?'; case encodePathSegment: // §3.3 // The RFC allows : @ & = + $ but saves / ; , for assigning // meaning to individual path segments. return c == '/' || c == ';' || c == ',' || c == '?'; case encodeUserPassword: // §3.2.1 // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in // userinfo, so we must escape only '@', '/', and '?'. // The parsing of userinfo treats ':' as special so we must escape // that too. return c == '@' || c == '/' || c == '?' || c == ':'; case encodeQueryComponent: // §3.4 // The RFC reserves (so we must escape) everything. return true; case encodeFragment: // §4.1 // The RFC text is silent but the grammar allows // everything: case so escape nothing. return false; } } if (mode == encodeFragment) { // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not // need to be escaped. To minimize potential breakage, we apply two restrictions: // (1) we always escape sub-delims outside of the fragment, and (2) we always // escape single quote to avoid breaking callers that had previously assumed that // single quotes would be escaped. See issue #19917. 
switch (c) { case '!': case '(': case ')': case '*': return false; } } return true; } std::string escape(const std::string &s, EncodingMode mode) { auto spaceCount = 0; auto hexCount = 0; for (auto i = 0; i < s.size(); i++) { auto c = s[i]; if (shouldEscape(c, mode)) { if (c == ' ' && mode == encodeQueryComponent) { spaceCount++; } else { hexCount++; } } } if (spaceCount == 0 && hexCount == 0) { return s; } auto required = s.size() + 2 * hexCount; auto t = std::vector<char>(); t.reserve(required); if (hexCount == 0) { t.insert(t.begin(), s.begin(), s.end()); for (auto i = 0; i < s.size(); i++) { if (s[i] == ' ') { t[i] = '+'; } } return std::string(t.begin(), t.end()); } auto j = 0; auto c = 0; for (auto i = 0; i < s.size(); i++) { auto c = s[i]; if (c == ' ' && mode == encodeQueryComponent) { t[j] = '+'; j++; } else if (shouldEscape(c, mode)) { t[j] = '%'; t[j + 1] = upperhex[c >> 4]; t[j + 2] = upperhex[c & 15]; j += 3; } else { t[j] = s[i]; j++; } } return std::string(t.begin(), t.end()); } std::string unescape(std::string s, EncodingMode mode) { // Count %, check that they're well-formed. auto n = 0; auto hasPlus = false; auto tmp = std::string(); auto v = 0; for (int i = 0; i < s.size();) { switch (s[i]) { case '%': n++; if (i + 2 >= s.size() || !std::isxdigit(s[i + 1]) || !std::isxdigit(s[i + 2])) { s = std::string(s.begin() + 1, s.end()); if (s.size() > 3) { s = std::string(s.begin(), s.begin() + 3); } throw std::invalid_argument("escape error: " + s); } // Per https://tools.ietf.org/html/rfc3986#page-21 // in the host component %-encoding can only be used // for non-ASCII bytes. // But https://tools.ietf.org/html/rfc6874#section-2 // introduces %25 being allowed to escape a percent sign // in IPv6 scoped-address literals. Yay. 
tmp = std::string(s.begin() + i, s.begin() + i + 3); if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") { throw std::invalid_argument("escape error: " + tmp); } if (mode == encodeZone) { // RFC 6874 says basically "anything goes" for zone identifiers // and that even non-ASCII can be redundantly escaped, // but it seems prudent to restrict %-escaped bytes here to those // that are valid host name bytes in their unescaped form. // That is, you can use escaping in the zone identifier but not // to introduce bytes you couldn't just write directly. // But Windows puts spaces here! Yay. v = unhex(s[i + 1]) << 4 | unhex(s[i + 2]); tmp = std::string(s.begin() + i, s.begin() + i + 3); if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) { throw std::invalid_argument("escape error: " + tmp); } } i += 3; break; case '+': hasPlus = mode == encodeQueryComponent; i++; break; default: if ((mode == encodeHost || mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) { tmp = std::string(s.begin() + i, s.begin() + i + 1); throw std::invalid_argument("invalid host: " + tmp); } i++; } } if (n == 0 && !hasPlus) { return s; } auto ss = std::stringstream(); for (int i = 0; i < s.size(); i++) { switch (s[i]) { case '%': ss << (char)(unhex(s[i + 1]) << 4 | unhex(s[i + 2])); i += 2; break; case '+': ss << ((mode == encodeQueryComponent) ? 
' ' : '+'); break; default: ss << s[i]; } } return ss.str(); } bool stringContainsCTLByte(const std::string &s) { for (auto c : s) { if (c < ' ' || c == 0x7f) { return true; } } return false; } std::string getScheme(std::string &url) { int i = 0; std::string scheme; for (auto &c : url) { if (('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z')) { // Do nothing } else if (('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.') { if (i == 0) { break; } } else if (c == ':') { scheme = std::string(url.begin(), url.begin() + i); url = std::string(url.begin() + i + 1, url.end()); break; } else { // we have encountered an invalid character, // so there is no valid scheme break; } i++; } return scheme; } // validUserinfo reports whether s is a valid userinfo string per RFC 3986 // Section 3.2.1: // userinfo = *( unreserved / pct-encoded / sub-delims / ":" ) // unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" // / "*" / "+" / "," / ";" / "=" // // It doesn't validate pct-encoded. The caller does that via func unescape. bool validUserinfo(const std::string &s) { for (auto r : s) { if ('A' <= r && r <= 'Z') { continue; } if ('a' <= r && r <= 'z') { continue; } if ('0' <= r && r <= '9') { continue; } switch (r) { case '-': case '.': case '_': case ':': case '~': case '!': case '$': case '&': case '\'': case '(': case ')': case '*': case '+': case ',': case ';': case '=': case '%': case '@': continue; default: return false; } } return true; } // validOptionalPort reports whether port is either an empty string // or matches /^:\d*$/ bool validOptionalPort(const std::string &port) { if (port.empty()) { return true; } if (port[0] != ':') { return false; } for (auto b = port.begin() + 1; b < port.end(); b++) { if (*b < '0' || *b > '9') { return false; } } return true; } // parseHost parses host as an authority without user // information. That is, as host[:port]. 
std::string parseHost(const std::string &host) { int idx; if (!host.empty() && host[0] == '[') { // Parse an IP-Literal in RFC 3986 and RFC 6874. // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80". idx = host.find_last_of(']'); if (idx >= host.size()) { throw std::invalid_argument("cannot find ']' in host"); } auto colonPort = std::string(host.begin() + idx + 1, host.end()); if (!validOptionalPort(colonPort)) { throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort)); } // RFC 6874 defines that %25 (%-encoded percent) introduces // the zone identifier, and the zone identifier can use basically // any %-encoding it likes. That's different from the host, which // can only %-encode non-ASCII bytes. // We do impose some restrictions on the zone, to avoid stupidity // like newlines. auto zone = host.find("%25"); if (idx != std::string::npos) { auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost); auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost); auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone); return host1 + host2 + host3; } } else if ((idx = host.find_last_of(':')) < host.size()) { auto colonPort = std::string(host.begin() + idx, host.end()); if (!validOptionalPort(colonPort)) { throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort)); } } return unescape(host, encodeHost); } void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) { auto i = authority.find_last_of('@'); if (i > authority.size()) { host = parseHost(authority); } else { host = parseHost(std::string(authority.begin() + i + 1, authority.end())); } if (i > authority.size()) { return; } auto userInfo = std::string(authority.begin(), authority.begin() + i); if (!validUserinfo(userInfo)) { throw std::invalid_argument("invalid userinfo"); } auto idx = userInfo.find(':'); if (idx == std::string::npos) { userInfo = unescape(userInfo, 
encodeUserPassword); ui = UserInfo(userInfo); } else { auto username = std::string(userInfo.begin(), userInfo.begin() + idx); auto password = std::string(userInfo.begin() + idx, userInfo.end()); ui.Username = unescape(username, encodeUserPassword); ui.Password = unescape(username, encodeUserPassword); } } URL URL::Parse(std::string url) { URL u; std::string frag; auto hashIndex = url.find("#"); if (hashIndex != std::string::npos) { frag = std::string(url.begin() + hashIndex, url.end()); url = std::string(url.begin(), url.begin() + hashIndex); } u.setFragment(frag); if (stringContainsCTLByte(url)) { throw std::invalid_argument("invalid url: string contains control bytes"); } if (url == "*") { u.Path = "*"; return u; } auto rest = url; u.Scheme = getScheme(rest); std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); }); if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') { u.ForceQuery = true; rest.pop_back(); } else { auto idx = rest.find("?"); if (idx != std::string::npos) { u.RawQuery = std::string(rest.begin() + idx, rest.end()); rest = std::string(rest.begin(), rest.begin() + idx); } } if (!rest.empty() && rest[0] != '/') { if (!u.Scheme.empty()) { // We consider rootless paths per RFC 3986 as opaque. u.Opaque = rest; return u; } } if (!u.Scheme.empty() || (rest.find("///") != 0 && rest.find("//") == 0)) { auto authority = std::string(rest.begin() + 2, rest.end()); rest = ""; int i = authority.find("/"); if (i != std::string::npos) { rest = std::string(authority.begin() + i, authority.end()); authority = std::string(authority.begin(), authority.begin() + i); } parseAuthority(authority, u.User, u.Host); } u.setPath(rest); return u; } void URL::setFragment(const std::string &f) { Fragment = unescape(f, encodeFragment); auto escf = escape(Fragment, encodeFragment); RawFragment = (escf == f) ? 
"" : f; } void URL::setPath(const std::string &p) { Path = unescape(p, encodePath); auto escp = escape(Path, encodePath); RawPath = (escp == p) ? "" : p; } std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); } std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); } std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); } std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); }

1	#include "common/url.h"
2
3	#include <fmt/format.h>
4
5	#include <cctype>
6
7	using namespace ProtoRock::Http;
8
// Identifies which URL component a string belongs to. escape/unescape and
// shouldEscape use it to decide which characters need percent-encoding.
enum EncodingMode {
    encodePath = 1,
    encodePathSegment = 2,
    encodeHost = 3,
    encodeZone = 4,
    encodeUserPassword = 5,
    encodeQueryComponent = 6,
    encodeFragment = 7,
};
18
// Converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value in the range 0..15. Any other character yields 0.
char unhex(char c) {
    char value = 0;
    if (c >= 'A' && c <= 'F') {
        value = static_cast<char>(c - 'A' + 10);
    } else if (c >= 'a' && c <= 'f') {
        value = static_cast<char>(c - 'a' + 10);
    } else if (c >= '0' && c <= '9') {
        value = static_cast<char>(c - '0');
    }
    return value;
}
31
// Lookup table: index 0..15 yields the matching uppercase hexadecimal digit.
// Used by escape() to emit the two digits of a %XX sequence.
const char *upperhex = "0123456789ABCDEF";
33
34	// Return true if the specified character should be escaped when
35	// appearing in a URL string, according to RFC 3986.
36	//
37	// Please be informed that for now shouldEscape does not check all
38	// reserved characters correctly. See golang.org/issue/5684.
39	bool shouldEscape(char c, EncodingMode mode) {
40	// §2.3 Unreserved characters (alphanum)
41	if ('a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z' \|\| '0' <= c && c <= '9') {
42	return false;
43	}
44	if (mode == encodeHost \|\| mode == encodeZone) {
45	// §3.2.2 Host allows
46	// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
47	// as part of reg-name.
48	// We add : because we include :port as part of host.
49	// We add [ ] because we include [ipv6]:port as part of host.
50	// We add < > because they're the only characters left that
51	// we could possibly allow, and Parse will reject them if we
52	// escape them (because hosts can't use %-encoding for
53	// ASCII bytes).
54	switch (c) {
55	case '!':
56	case '$':
57	case '&':
58	case '\'':
59	case '(':
60	case ')':
61	case '*':
62	case '+':
63	case ',':
64	case ';':
65	case '=':
66	case ':':
67	case '[':
68	case ']':
69	case '<':
70	case '>':
71	case '"':
72	return false;
73	}
74	}
75
76	switch (c) {
77	// §2.3 Unreserved characters (mark)
78	case '-':
79	case '_':
80	case '.':
81	case '~':
82	return false;
83
84	// §2.2 Reserved characters (reserved)
85	case '$':
86	case '&':
87	case '+':
88	case ',':
89	case '/':
90	case ':':
91	case ';':
92	case '=':
93	case '?':
94	case '@':
95	// Different sections of the URL allow a few of
96	// the reserved characters to appear unescaped.
97	switch (mode) {
98	case encodePath: // §3.3
99	// The RFC allows : @ & = + $ but saves / ; , for assigning
100	// meaning to individual path segments. This package
101	// only manipulates the path as a whole, so we allow those
102	// last three as well. That leaves only ? to escape.
103	return c == '?';
104
105	case encodePathSegment: // §3.3
106	// The RFC allows : @ & = + $ but saves / ; , for assigning
107	// meaning to individual path segments.
108	return c == '/' \|\| c == ';' \|\| c == ',' \|\| c == '?';
109
110	case encodeUserPassword: // §3.2.1
111	// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
112	// userinfo, so we must escape only '@', '/', and '?'.
113	// The parsing of userinfo treats ':' as special so we must escape
114	// that too.
115	return c == '@' \|\| c == '/' \|\| c == '?' \|\| c == ':';
116
117	case encodeQueryComponent: // §3.4
118	// The RFC reserves (so we must escape) everything.
119	return true;
120
121	case encodeFragment: // §4.1
122	// The RFC text is silent but the grammar allows
123	// everything: case so escape nothing.
124	return false;
125	}
126	}
127
128	if (mode == encodeFragment) {
129	// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
130	// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
131	// need to be escaped. To minimize potential breakage, we apply two restrictions:
132	// (1) we always escape sub-delims outside of the fragment, and (2) we always
133	// escape single quote to avoid breaking callers that had previously assumed that
134	// single quotes would be escaped. See issue #19917.
135	switch (c) {
136	case '!':
137	case '(':
138	case ')':
139	case '*':
140	return false;
141	}
142	}
143
144	return true;
145	}
146
147	std::string escape(const std::string &s, EncodingMode mode) {
148	auto spaceCount = 0;
149	auto hexCount = 0;
150
151	for (auto i = 0; i < s.size(); i++) {
152	auto c = s[i];
153	if (shouldEscape(c, mode)) {
154	if (c == ' ' && mode == encodeQueryComponent) {
155	spaceCount++;
156	} else {
157	hexCount++;
158	}
159	}
160	}
161
162	if (spaceCount == 0 && hexCount == 0) {
163	return s;
164	}
165
166	auto required = s.size() + 2 * hexCount;
167	auto t = std::vector<char>();
168	t.reserve(required);
169
170	if (hexCount == 0) {
171	t.insert(t.begin(), s.begin(), s.end());
172	for (auto i = 0; i < s.size(); i++) {
173	if (s[i] == ' ') {
174	t[i] = '+';
175	}
176	}
177	return std::string(t.begin(), t.end());
178	}
179
180	auto j = 0;
181	auto c = 0;
182	for (auto i = 0; i < s.size(); i++) {
183	auto c = s[i];
184	if (c == ' ' && mode == encodeQueryComponent) {
185	t[j] = '+';
186	j++;
187	} else if (shouldEscape(c, mode)) {
188	t[j] = '%';
189	t[j + 1] = upperhex[c >> 4];
190	t[j + 2] = upperhex[c & 15];
191	j += 3;
192	} else {
193	t[j] = s[i];
194	j++;
195	}
196	}
197
198	return std::string(t.begin(), t.end());
199	}
200
201	std::string unescape(std::string s, EncodingMode mode) {
202	// Count %, check that they're well-formed.
203	auto n = 0;
204	auto hasPlus = false;
205	auto tmp = std::string();
206	auto v = 0;
207	for (int i = 0; i < s.size();) {
208	switch (s[i]) {
209	case '%':
210	n++;
211	if (i + 2 >= s.size() \|\| !std::isxdigit(s[i + 1]) \|\| !std::isxdigit(s[i + 2])) {
212	s = std::string(s.begin() + 1, s.end());
213	if (s.size() > 3) {
214	s = std::string(s.begin(), s.begin() + 3);
215	}
216	throw std::invalid_argument("escape error: " + s);
217	}
218	// Per https://tools.ietf.org/html/rfc3986#page-21
219	// in the host component %-encoding can only be used
220	// for non-ASCII bytes.
221	// But https://tools.ietf.org/html/rfc6874#section-2
222	// introduces %25 being allowed to escape a percent sign
223	// in IPv6 scoped-address literals. Yay.
224	tmp = std::string(s.begin() + i, s.begin() + i + 3);
225	if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") {
226	throw std::invalid_argument("escape error: " + tmp);
227	}
228
229	if (mode == encodeZone) {
230	// RFC 6874 says basically "anything goes" for zone identifiers
231	// and that even non-ASCII can be redundantly escaped,
232	// but it seems prudent to restrict %-escaped bytes here to those
233	// that are valid host name bytes in their unescaped form.
234	// That is, you can use escaping in the zone identifier but not
235	// to introduce bytes you couldn't just write directly.
236	// But Windows puts spaces here! Yay.
237	v = unhex(s[i + 1]) << 4 \| unhex(s[i + 2]);
238	tmp = std::string(s.begin() + i, s.begin() + i + 3);
239	if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) {
240	throw std::invalid_argument("escape error: " + tmp);
241	}
242	}
243	i += 3;
244	break;
245	case '+':
246	hasPlus = mode == encodeQueryComponent;
247	i++;
248	break;
249	default:
250	if ((mode == encodeHost \|\| mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) {
251	tmp = std::string(s.begin() + i, s.begin() + i + 1);
252	throw std::invalid_argument("invalid host: " + tmp);
253	}
254	i++;
255	}
256	}
257
258	if (n == 0 && !hasPlus) {
259	return s;
260	}
261
262	auto ss = std::stringstream();
263	for (int i = 0; i < s.size(); i++) {
264	switch (s[i]) {
265	case '%':
266	ss << (char)(unhex(s[i + 1]) << 4 \| unhex(s[i + 2]));
267	i += 2;
268	break;
269	case '+':
270	ss << ((mode == encodeQueryComponent) ? ' ' : '+');
271	break;
272	default:
273	ss << s[i];
274	}
275	}
276	return ss.str();
277	}
278
// Reports whether s contains an ASCII control byte (anything below 0x20, or
// 0x7f).
//
// Each byte is examined as unsigned char: where plain char is signed, bytes
// >= 0x80 (e.g. UTF-8 sequences) are negative and must not be mistaken for
// control bytes.
bool stringContainsCTLByte(const std::string &s) {
    for (const char c : s) {
        const auto b = static_cast<unsigned char>(c);
        if (b < ' ' || b == 0x7f) {
            return true;
        }
    }
    return false;
}
287
// Splits a leading "scheme:" off url.
//
// A scheme starts with a letter and continues with letters, digits, '+',
// '-' or '.'. When such a prefix is followed by ':', the scheme is returned
// and url is reduced to the text after the colon. When no valid scheme is
// present, an empty string is returned and url is left untouched.
//
// Throws std::invalid_argument("missing protocol scheme") when url starts
// with ':', i.e. the scheme would be empty.
std::string getScheme(std::string &url) {
    for (std::string::size_type i = 0; i < url.size(); ++i) {
        const char c = url[i];

        const bool isAlpha = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        if (isAlpha) {
            continue;
        }

        const bool isSchemeExtra = (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
        if (isSchemeExtra) {
            if (i == 0) {
                // A scheme must start with a letter.
                return {};
            }
            continue;
        }

        if (c == ':') {
            if (i == 0) {
                throw std::invalid_argument("missing protocol scheme");
            }
            std::string scheme = url.substr(0, i);
            url.erase(0, i + 1);
            return scheme;
        }

        // Any other character means there is no valid scheme.
        return {};
    }
    return {};
}
311
// Checks that s only contains characters permitted in a userinfo string by
// RFC 3986 Section 3.2.1:
//   userinfo   = *( unreserved / pct-encoded / sub-delims / ":" )
//   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
//   sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
//              / "*" / "+" / "," / ";" / "="
//
// '%' and '@' are accepted as well. Percent-escapes themselves are not
// validated here; unescape() does that.
bool validUserinfo(const std::string &s) {
    for (const char ch : s) {
        const bool alnum = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9');
        if (alnum) {
            continue;
        }
        switch (ch) {
            case '-': case '.': case '_': case ':': case '~': case '!':
            case '$': case '&': case '\'': case '(': case ')': case '*':
            case '+': case ',': case ';': case '=': case '%': case '@':
                break;
            default:
                return false;
        }
    }
    return true;
}
357
// Reports whether port is either empty or a ':' followed by zero or more
// decimal digits, i.e. matches /^:\d*$/.
bool validOptionalPort(const std::string &port) {
    if (port.empty()) {
        return true;
    }
    if (port.front() != ':') {
        return false;
    }
    // Inspect the characters after the colon (dereference the iterator;
    // comparing the iterator itself to a character is not meaningful).
    for (auto it = port.begin() + 1; it != port.end(); ++it) {
        if (*it < '0' || *it > '9') {
            return false;
        }
    }
    return true;
}
374
375	// parseHost parses host as an authority without user
376	// information. That is, as host[:port].
377	std::string parseHost(const std::string &host) {
378	int idx;
379	if (!host.empty() && host[0] == '[') {
380	// Parse an IP-Literal in RFC 3986 and RFC 6874.
381	// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
382	idx = host.find_last_of(']');
383	if (idx >= host.size()) {
384	throw std::invalid_argument("cannot find ']' in host");
385	}
386	auto colonPort = std::string(host.begin() + idx + 1, host.end());
387	if (!validOptionalPort(colonPort)) {
388	throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
389	}
390	// RFC 6874 defines that %25 (%-encoded percent) introduces
391	// the zone identifier, and the zone identifier can use basically
392	// any %-encoding it likes. That's different from the host, which
393	// can only %-encode non-ASCII bytes.
394	// We do impose some restrictions on the zone, to avoid stupidity
395	// like newlines.
396	auto zone = host.find("%25");
397	if (idx != std::string::npos) {
398	auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost);
399	auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost);
400	auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone);
401	return host1 + host2 + host3;
402	}
403	} else if ((idx = host.find_last_of(':')) < host.size()) {
404	auto colonPort = std::string(host.begin() + idx, host.end());
405	if (!validOptionalPort(colonPort)) {
406	throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
407	}
408	}
409
410	return unescape(host, encodeHost);
411	}
412
413	void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) {
414	auto i = authority.find_last_of('@');
415	if (i > authority.size()) {
416	host = parseHost(authority);
417	} else {
418	host = parseHost(std::string(authority.begin() + i + 1, authority.end()));
419	}
420
421	if (i > authority.size()) {
422	return;
423	}
424
425	auto userInfo = std::string(authority.begin(), authority.begin() + i);
426	if (!validUserinfo(userInfo)) {
427	throw std::invalid_argument("invalid userinfo");
428	}
429	auto idx = userInfo.find(':');
430	if (idx == std::string::npos) {
431	userInfo = unescape(userInfo, encodeUserPassword);
432	ui = UserInfo(userInfo);
433	} else {
434	auto username = std::string(userInfo.begin(), userInfo.begin() + idx);
435	auto password = std::string(userInfo.begin() + idx, userInfo.end());
436	ui.Username = unescape(username, encodeUserPassword);
437	ui.Password = unescape(username, encodeUserPassword);
438	}
439	}
440
441	URL URL::Parse(std::string url) {
442	URL u;
443	std::string frag;
444	auto hashIndex = url.find("#");
445	if (hashIndex != std::string::npos) {
446	frag = std::string(url.begin() + hashIndex, url.end());
447	url = std::string(url.begin(), url.begin() + hashIndex);
448	}
449
450	u.setFragment(frag);
451
452	if (stringContainsCTLByte(url)) {
453	throw std::invalid_argument("invalid url: string contains control bytes");
454	}
455
456	if (url == "*") {
457	u.Path = "*";
458	return u;
459	}
460	auto rest = url;
461
462	u.Scheme = getScheme(rest);
463	std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); });
464
465	if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') {
466	u.ForceQuery = true;
467	rest.pop_back();
468	} else {
469	auto idx = rest.find("?");
470	if (idx != std::string::npos) {
471	u.RawQuery = std::string(rest.begin() + idx, rest.end());
472	rest = std::string(rest.begin(), rest.begin() + idx);
473	}
474	}
475
476	if (!rest.empty() && rest[0] != '/') {
477	if (!u.Scheme.empty()) {
478	// We consider rootless paths per RFC 3986 as opaque.
479	u.Opaque = rest;
480	return u;
481	}
482	}
483
484	if (!u.Scheme.empty() \|\| (rest.find("///") != 0 && rest.find("//") == 0)) {
485	auto authority = std::string(rest.begin() + 2, rest.end());
486	rest = "";
487	int i = authority.find("/");
488	if (i != std::string::npos) {
489	rest = std::string(authority.begin() + i, authority.end());
490	authority = std::string(authority.begin(), authority.begin() + i);
491	}
492	parseAuthority(authority, u.User, u.Host);
493	}
494
495	u.setPath(rest);
496
497	return u;
498	}
499
500	void URL::setFragment(const std::string &f) {
501	Fragment = unescape(f, encodeFragment);
502	auto escf = escape(Fragment, encodeFragment);
503	RawFragment = (escf == f) ? "" : f;
504	}
505
506	void URL::setPath(const std::string &p) {
507	Path = unescape(p, encodePath);
508	auto escp = escape(Path, encodePath);
509	RawPath = (escp == p) ? "" : p;
510	}
511
512	std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); }
513	std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); }
514	std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); }
515	std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); }

url.h · 1.4 KiB · C Bruto

1	#pragma once
2
3	#include <algorithm>
4	#include <map>
5	#include <stdexcept>
6
7	#include "common/common.h"
8
9	// Based on Golang Implementation
10	// MIT
11
12	namespace ProtoRock {
13	namespace Http {
// User name and password taken from the userinfo part of a URL authority.
struct UserInfo {
    std::string Username;
    std::string Password;

    // Creates a pair with both fields empty.
    UserInfo() : Username(), Password() {}

    // Creates a pair with only the user name set; the password stays empty.
    UserInfo(const std::string &u) : Username(u), Password() {}
};
21
22	struct URL {
23	private:
24	void setFragment(const std::string &);
25	void setPath(const std::string &);
26
27	public:
28	std::string Scheme;
29	// encoded opaque data
30	std::string Opaque;
31	// username and password information
32	UserInfo User;
33	// host or host:port
34	std::string Host;
35	// path (relative paths may omit leading slash)
36	std::string Path;
37	// encoded path hint (see EscapedPath method)
38	std::string RawPath;
39	// append a query ('?') even if RawQuery is empty
40	bool ForceQuery = false;
41	// encoded query values, without '?'
42	std::string RawQuery;
43	// fragment for references, without '#'
44	std::string Fragment;
45	// encoded fragment hint (see EscapedFragment method)
46	std::string RawFragment;
47
48	static URL Parse(std::string url);
49	static std::string PathEscape(const std::string &path);
50	static std::string PathUnescape(const std::string &path);
51	static std::string QueryEscape(const std::string &query);
52	static std::string QueryUnescape(const std::string &query);
53	};
54
55	} // namespace Http
56	} // namespace ProtoRock