Revision of url.cpp

Baixar ZIP

Lucas Teske revisou este gist 3 years ago. Ir para a revisão

Sem alterações

Lucas Teske revisou este gist 3 years ago. Ir para a revisão

Sem alterações

Lucas Teske revisou este gist 3 years ago. Ir para a revisão

Sem alterações

Lucas Teske revisou este gist 3 years ago. Ir para a revisão

2 files changed, 571 insertions

url.cpp(arquivo criado)

		@@ -0,0 +1,515 @@
1	+	#include "common/url.h"
2	+
3	+	#include <fmt/format.h>
4	+
5	+	#include <cctype>
6	+
7	+	using namespace ProtoRock::Http;
8	+
// Identifies which part of a URL a string belongs to. shouldEscape(),
// escape() and unescape() use it to pick the set of characters that may
// appear unescaped in that part.
enum EncodingMode {
    encodePath = 1,            // a whole path
    encodePathSegment = 2,     // a single path segment
    encodeHost = 3,            // host[:port]
    encodeZone = 4,            // IPv6 zone identifier inside a host
    encodeUserPassword = 5,    // userinfo (user or password)
    encodeQueryComponent = 6,  // one query key or value
    encodeFragment = 7,        // fragment after '#'
};
18	+
// Converts a single hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to its
// numeric value in the range 0-15. Any other character yields 0.
char unhex(char c) {
    if (c >= '0' && c <= '9') return c - '0';
    if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
    if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
    return 0;
}
31	+
// Uppercase hexadecimal digits; index with a nibble value (0-15).
const char *upperhex = "0123456789" "ABCDEF";
33	+
34	+	// Return true if the specified character should be escaped when
35	+	// appearing in a URL string, according to RFC 3986.
36	+	//
37	+	// Please be informed that for now shouldEscape does not check all
38	+	// reserved characters correctly. See golang.org/issue/5684.
39	+	bool shouldEscape(char c, EncodingMode mode) {
40	+	// §2.3 Unreserved characters (alphanum)
41	+	if ('a' <= c && c <= 'z' \|\| 'A' <= c && c <= 'Z' \|\| '0' <= c && c <= '9') {
42	+	return false;
43	+	}
44	+	if (mode == encodeHost \|\| mode == encodeZone) {
45	+	// §3.2.2 Host allows
46	+	// sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
47	+	// as part of reg-name.
48	+	// We add : because we include :port as part of host.
49	+	// We add [ ] because we include [ipv6]:port as part of host.
50	+	// We add < > because they're the only characters left that
51	+	// we could possibly allow, and Parse will reject them if we
52	+	// escape them (because hosts can't use %-encoding for
53	+	// ASCII bytes).
54	+	switch (c) {
55	+	case '!':
56	+	case '$':
57	+	case '&':
58	+	case '\'':
59	+	case '(':
60	+	case ')':
61	+	case '*':
62	+	case '+':
63	+	case ',':
64	+	case ';':
65	+	case '=':
66	+	case ':':
67	+	case '[':
68	+	case ']':
69	+	case '<':
70	+	case '>':
71	+	case '"':
72	+	return false;
73	+	}
74	+	}
75	+
76	+	switch (c) {
77	+	// §2.3 Unreserved characters (mark)
78	+	case '-':
79	+	case '_':
80	+	case '.':
81	+	case '~':
82	+	return false;
83	+
84	+	// §2.2 Reserved characters (reserved)
85	+	case '$':
86	+	case '&':
87	+	case '+':
88	+	case ',':
89	+	case '/':
90	+	case ':':
91	+	case ';':
92	+	case '=':
93	+	case '?':
94	+	case '@':
95	+	// Different sections of the URL allow a few of
96	+	// the reserved characters to appear unescaped.
97	+	switch (mode) {
98	+	case encodePath: // §3.3
99	+	// The RFC allows : @ & = + $ but saves / ; , for assigning
100	+	// meaning to individual path segments. This package
101	+	// only manipulates the path as a whole, so we allow those
102	+	// last three as well. That leaves only ? to escape.
103	+	return c == '?';
104	+
105	+	case encodePathSegment: // §3.3
106	+	// The RFC allows : @ & = + $ but saves / ; , for assigning
107	+	// meaning to individual path segments.
108	+	return c == '/' \|\| c == ';' \|\| c == ',' \|\| c == '?';
109	+
110	+	case encodeUserPassword: // §3.2.1
111	+	// The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
112	+	// userinfo, so we must escape only '@', '/', and '?'.
113	+	// The parsing of userinfo treats ':' as special so we must escape
114	+	// that too.
115	+	return c == '@' \|\| c == '/' \|\| c == '?' \|\| c == ':';
116	+
117	+	case encodeQueryComponent: // §3.4
118	+	// The RFC reserves (so we must escape) everything.
119	+	return true;
120	+
121	+	case encodeFragment: // §4.1
122	+	// The RFC text is silent but the grammar allows
123	+	// everything: case so escape nothing.
124	+	return false;
125	+	}
126	+	}
127	+
128	+	if (mode == encodeFragment) {
129	+	// RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
130	+	// included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
131	+	// need to be escaped. To minimize potential breakage, we apply two restrictions:
132	+	// (1) we always escape sub-delims outside of the fragment, and (2) we always
133	+	// escape single quote to avoid breaking callers that had previously assumed that
134	+	// single quotes would be escaped. See issue #19917.
135	+	switch (c) {
136	+	case '!':
137	+	case '(':
138	+	case ')':
139	+	case '*':
140	+	return false;
141	+	}
142	+	}
143	+
144	+	return true;
145	+	}
146	+
147	+	std::string escape(const std::string &s, EncodingMode mode) {
148	+	auto spaceCount = 0;
149	+	auto hexCount = 0;
150	+
151	+	for (auto i = 0; i < s.size(); i++) {
152	+	auto c = s[i];
153	+	if (shouldEscape(c, mode)) {
154	+	if (c == ' ' && mode == encodeQueryComponent) {
155	+	spaceCount++;
156	+	} else {
157	+	hexCount++;
158	+	}
159	+	}
160	+	}
161	+
162	+	if (spaceCount == 0 && hexCount == 0) {
163	+	return s;
164	+	}
165	+
166	+	auto required = s.size() + 2 * hexCount;
167	+	auto t = std::vector<char>();
168	+	t.reserve(required);
169	+
170	+	if (hexCount == 0) {
171	+	t.insert(t.begin(), s.begin(), s.end());
172	+	for (auto i = 0; i < s.size(); i++) {
173	+	if (s[i] == ' ') {
174	+	t[i] = '+';
175	+	}
176	+	}
177	+	return std::string(t.begin(), t.end());
178	+	}
179	+
180	+	auto j = 0;
181	+	auto c = 0;
182	+	for (auto i = 0; i < s.size(); i++) {
183	+	auto c = s[i];
184	+	if (c == ' ' && mode == encodeQueryComponent) {
185	+	t[j] = '+';
186	+	j++;
187	+	} else if (shouldEscape(c, mode)) {
188	+	t[j] = '%';
189	+	t[j + 1] = upperhex[c >> 4];
190	+	t[j + 2] = upperhex[c & 15];
191	+	j += 3;
192	+	} else {
193	+	t[j] = s[i];
194	+	j++;
195	+	}
196	+	}
197	+
198	+	return std::string(t.begin(), t.end());
199	+	}
200	+
201	+	std::string unescape(std::string s, EncodingMode mode) {
202	+	// Count %, check that they're well-formed.
203	+	auto n = 0;
204	+	auto hasPlus = false;
205	+	auto tmp = std::string();
206	+	auto v = 0;
207	+	for (int i = 0; i < s.size();) {
208	+	switch (s[i]) {
209	+	case '%':
210	+	n++;
211	+	if (i + 2 >= s.size() \|\| !std::isxdigit(s[i + 1]) \|\| !std::isxdigit(s[i + 2])) {
212	+	s = std::string(s.begin() + 1, s.end());
213	+	if (s.size() > 3) {
214	+	s = std::string(s.begin(), s.begin() + 3);
215	+	}
216	+	throw std::invalid_argument("escape error: " + s);
217	+	}
218	+	// Per https://tools.ietf.org/html/rfc3986#page-21
219	+	// in the host component %-encoding can only be used
220	+	// for non-ASCII bytes.
221	+	// But https://tools.ietf.org/html/rfc6874#section-2
222	+	// introduces %25 being allowed to escape a percent sign
223	+	// in IPv6 scoped-address literals. Yay.
224	+	tmp = std::string(s.begin() + i, s.begin() + i + 3);
225	+	if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") {
226	+	throw std::invalid_argument("escape error: " + tmp);
227	+	}
228	+
229	+	if (mode == encodeZone) {
230	+	// RFC 6874 says basically "anything goes" for zone identifiers
231	+	// and that even non-ASCII can be redundantly escaped,
232	+	// but it seems prudent to restrict %-escaped bytes here to those
233	+	// that are valid host name bytes in their unescaped form.
234	+	// That is, you can use escaping in the zone identifier but not
235	+	// to introduce bytes you couldn't just write directly.
236	+	// But Windows puts spaces here! Yay.
237	+	v = unhex(s[i + 1]) << 4 \| unhex(s[i + 2]);
238	+	tmp = std::string(s.begin() + i, s.begin() + i + 3);
239	+	if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) {
240	+	throw std::invalid_argument("escape error: " + tmp);
241	+	}
242	+	}
243	+	i += 3;
244	+	break;
245	+	case '+':
246	+	hasPlus = mode == encodeQueryComponent;
247	+	i++;
248	+	break;
249	+	default:
250	+	if ((mode == encodeHost \|\| mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) {
251	+	tmp = std::string(s.begin() + i, s.begin() + i + 1);
252	+	throw std::invalid_argument("invalid host: " + tmp);
253	+	}
254	+	i++;
255	+	}
256	+	}
257	+
258	+	if (n == 0 && !hasPlus) {
259	+	return s;
260	+	}
261	+
262	+	auto ss = std::stringstream();
263	+	for (int i = 0; i < s.size(); i++) {
264	+	switch (s[i]) {
265	+	case '%':
266	+	ss << (char)(unhex(s[i + 1]) << 4 \| unhex(s[i + 2]));
267	+	i += 2;
268	+	break;
269	+	case '+':
270	+	ss << ((mode == encodeQueryComponent) ? ' ' : '+');
271	+	break;
272	+	default:
273	+	ss << s[i];
274	+	}
275	+	}
276	+	return ss.str();
277	+	}
278	+
// Reports whether s contains an ASCII control byte (0x00-0x1F or 0x7F).
//
// Each byte is examined as an unsigned value. With a signed char, bytes
// >= 0x80 (e.g. UTF-8 sequences) would compare below ' ' and be reported
// as control bytes.
bool stringContainsCTLByte(const std::string &s) {
    for (const char c : s) {
        const unsigned char b = static_cast<unsigned char>(c);
        if (b < 0x20 || b == 0x7f) {
            return true;
        }
    }
    return false;
}
287	+
// Splits a leading "scheme:" off url.
//
// A scheme starts with a letter and continues with letters, digits, '+',
// '-' or '.', and is terminated by ':'. When one is found it is returned
// and url is replaced by the text after the ':'. Otherwise an empty string
// is returned and url is left untouched.
//
// Throws std::invalid_argument("missing protocol scheme") when url starts
// with ':'. Previously the colon was silently dropped from url.
std::string getScheme(std::string &url) {
    for (std::string::size_type i = 0; i < url.size(); ++i) {
        const char c = url[i];

        const bool isLetter = ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z');
        if (isLetter) {
            continue;
        }

        const bool isSchemeExtra = ('0' <= c && c <= '9') || c == '+' || c == '-' || c == '.';
        if (isSchemeExtra) {
            if (i == 0) {
                // A scheme must start with a letter.
                return std::string();
            }
            continue;
        }

        if (c == ':') {
            if (i == 0) {
                throw std::invalid_argument("missing protocol scheme");
            }
            std::string scheme = url.substr(0, i);
            url.erase(0, i + 1);
            return scheme;
        }

        // We have encountered an invalid character,
        // so there is no valid scheme.
        return std::string();
    }
    return std::string();
}
311	+
// validUserinfo reports whether s is a valid userinfo string per RFC 3986
// Section 3.2.1:
//     userinfo    = *( unreserved / pct-encoded / sub-delims / ":" )
//     unreserved  = ALPHA / DIGIT / "-" / "." / "_" / "~"
//     sub-delims  = "!" / "$" / "&" / "'" / "(" / ")"
//                   / "*" / "+" / "," / ";" / "="
//
// '%' and '@' are accepted as well. Percent-encoded sequences are not
// validated here; the caller does that through unescape().
bool validUserinfo(const std::string &s) {
    // Every non-alphanumeric character that is accepted.
    static const std::string allowedPunctuation = "-._:~!$&'()*+,;=%@";

    for (const char r : s) {
        const bool isAlnum = ('A' <= r && r <= 'Z') || ('a' <= r && r <= 'z') || ('0' <= r && r <= '9');
        if (!isAlnum && allowedPunctuation.find(r) == std::string::npos) {
            return false;
        }
    }
    return true;
}
357	+
// validOptionalPort reports whether port is either an empty string
// or matches /^:\d*$/ (a colon followed by zero or more decimal digits).
bool validOptionalPort(const std::string &port) {
    if (port.empty()) {
        return true;
    }
    if (port[0] != ':') {
        return false;
    }
    // Check the characters themselves; the iterator has to be dereferenced
    // before it can be compared with a digit.
    for (auto it = port.begin() + 1; it != port.end(); ++it) {
        if (*it < '0' || *it > '9') {
            return false;
        }
    }
    return true;
}
374	+
375	+	// parseHost parses host as an authority without user
376	+	// information. That is, as host[:port].
377	+	std::string parseHost(const std::string &host) {
378	+	int idx;
379	+	if (!host.empty() && host[0] == '[') {
380	+	// Parse an IP-Literal in RFC 3986 and RFC 6874.
381	+	// E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
382	+	idx = host.find_last_of(']');
383	+	if (idx >= host.size()) {
384	+	throw std::invalid_argument("cannot find ']' in host");
385	+	}
386	+	auto colonPort = std::string(host.begin() + idx + 1, host.end());
387	+	if (!validOptionalPort(colonPort)) {
388	+	throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
389	+	}
390	+	// RFC 6874 defines that %25 (%-encoded percent) introduces
391	+	// the zone identifier, and the zone identifier can use basically
392	+	// any %-encoding it likes. That's different from the host, which
393	+	// can only %-encode non-ASCII bytes.
394	+	// We do impose some restrictions on the zone, to avoid stupidity
395	+	// like newlines.
396	+	auto zone = host.find("%25");
397	+	if (idx != std::string::npos) {
398	+	auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost);
399	+	auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost);
400	+	auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone);
401	+	return host1 + host2 + host3;
402	+	}
403	+	} else if ((idx = host.find_last_of(':')) < host.size()) {
404	+	auto colonPort = std::string(host.begin() + idx, host.end());
405	+	if (!validOptionalPort(colonPort)) {
406	+	throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
407	+	}
408	+	}
409	+
410	+	return unescape(host, encodeHost);
411	+	}
412	+
413	+	void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) {
414	+	auto i = authority.find_last_of('@');
415	+	if (i > authority.size()) {
416	+	host = parseHost(authority);
417	+	} else {
418	+	host = parseHost(std::string(authority.begin() + i + 1, authority.end()));
419	+	}
420	+
421	+	if (i > authority.size()) {
422	+	return;
423	+	}
424	+
425	+	auto userInfo = std::string(authority.begin(), authority.begin() + i);
426	+	if (!validUserinfo(userInfo)) {
427	+	throw std::invalid_argument("invalid userinfo");
428	+	}
429	+	auto idx = userInfo.find(':');
430	+	if (idx == std::string::npos) {
431	+	userInfo = unescape(userInfo, encodeUserPassword);
432	+	ui = UserInfo(userInfo);
433	+	} else {
434	+	auto username = std::string(userInfo.begin(), userInfo.begin() + idx);
435	+	auto password = std::string(userInfo.begin() + idx, userInfo.end());
436	+	ui.Username = unescape(username, encodeUserPassword);
437	+	ui.Password = unescape(username, encodeUserPassword);
438	+	}
439	+	}
440	+
441	+	URL URL::Parse(std::string url) {
442	+	URL u;
443	+	std::string frag;
444	+	auto hashIndex = url.find("#");
445	+	if (hashIndex != std::string::npos) {
446	+	frag = std::string(url.begin() + hashIndex, url.end());
447	+	url = std::string(url.begin(), url.begin() + hashIndex);
448	+	}
449	+
450	+	u.setFragment(frag);
451	+
452	+	if (stringContainsCTLByte(url)) {
453	+	throw std::invalid_argument("invalid url: string contains control bytes");
454	+	}
455	+
456	+	if (url == "*") {
457	+	u.Path = "*";
458	+	return u;
459	+	}
460	+	auto rest = url;
461	+
462	+	u.Scheme = getScheme(rest);
463	+	std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); });
464	+
465	+	if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') {
466	+	u.ForceQuery = true;
467	+	rest.pop_back();
468	+	} else {
469	+	auto idx = rest.find("?");
470	+	if (idx != std::string::npos) {
471	+	u.RawQuery = std::string(rest.begin() + idx, rest.end());
472	+	rest = std::string(rest.begin(), rest.begin() + idx);
473	+	}
474	+	}
475	+
476	+	if (!rest.empty() && rest[0] != '/') {
477	+	if (!u.Scheme.empty()) {
478	+	// We consider rootless paths per RFC 3986 as opaque.
479	+	u.Opaque = rest;
480	+	return u;
481	+	}
482	+	}
483	+
484	+	if (!u.Scheme.empty() \|\| (rest.find("///") != 0 && rest.find("//") == 0)) {
485	+	auto authority = std::string(rest.begin() + 2, rest.end());
486	+	rest = "";
487	+	int i = authority.find("/");
488	+	if (i != std::string::npos) {
489	+	rest = std::string(authority.begin() + i, authority.end());
490	+	authority = std::string(authority.begin(), authority.begin() + i);
491	+	}
492	+	parseAuthority(authority, u.User, u.Host);
493	+	}
494	+
495	+	u.setPath(rest);
496	+
497	+	return u;
498	+	}
499	+
500	+	void URL::setFragment(const std::string &f) {
501	+	Fragment = unescape(f, encodeFragment);
502	+	auto escf = escape(Fragment, encodeFragment);
503	+	RawFragment = (escf == f) ? "" : f;
504	+	}
505	+
506	+	void URL::setPath(const std::string &p) {
507	+	Path = unescape(p, encodePath);
508	+	auto escp = escape(Path, encodePath);
509	+	RawPath = (escp == p) ? "" : p;
510	+	}
511	+
512	+	std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); }
513	+	std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); }
514	+	std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); }
515	+	std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); }

url.h(arquivo criado)

		@@ -0,0 +1,56 @@
1	+	#pragma once
2	+
3	+	#include <algorithm>
4	+	#include <map>
5	+	#include <stdexcept>
6	+
7	+	#include "common/common.h"
8	+
9	+	// Based on Golang Implementation
10	+	// MIT
11	+
namespace ProtoRock {
namespace Http {

// Username and password taken from the userinfo part of a URL authority.
struct UserInfo {
    std::string Username;
    std::string Password;

    // Creates an empty userinfo.
    UserInfo() {}
    // Creates a userinfo with only a username; the password stays empty.
    UserInfo(const std::string &u) : Username(u) {}
};

// A parsed URL. Obtain one through URL::Parse().
struct URL {
    std::string Scheme;
    // encoded opaque data
    std::string Opaque;
    // username and password information
    UserInfo User;
    // host or host:port
    std::string Host;
    // path (relative paths may omit leading slash)
    std::string Path;
    // encoded path hint; empty when re-escaping Path reproduces the input
    std::string RawPath;
    // append a query ('?') even if RawQuery is empty
    bool ForceQuery = false;
    // encoded query values, without '?'
    std::string RawQuery;
    // fragment for references, without '#'
    std::string Fragment;
    // encoded fragment hint; empty when re-escaping Fragment reproduces the input
    std::string RawFragment;

    // Parses url into a URL; throws std::invalid_argument on malformed input.
    static URL Parse(std::string url);
    // Percent-encoding helpers for paths and query components.
    static std::string PathEscape(const std::string &path);
    static std::string PathUnescape(const std::string &path);
    static std::string QueryEscape(const std::string &query);
    static std::string QueryUnescape(const std::string &query);

private:
    // Set Fragment/RawFragment and Path/RawPath from their encoded form.
    void setFragment(const std::string &);
    void setPath(const std::string &);
};

}  // namespace Http
}  // namespace ProtoRock

Próximo Anterior