Última atividade 1 month ago

C++ implementation of Go's net/url package. I wrote it for a project and decided to save it here.

racerxdl's Avatar Lucas Teske revisou este gist 3 years ago. Ir para a revisão

Sem alterações

racerxdl's Avatar Lucas Teske revisou este gist 3 years ago. Ir para a revisão

Sem alterações

racerxdl's Avatar Lucas Teske revisou este gist 3 years ago. Ir para a revisão

Sem alterações

racerxdl's Avatar Lucas Teske revisou este gist 3 years ago. Ir para a revisão

2 files changed, 571 insertions

url.cpp(arquivo criado)

@@ -0,0 +1,515 @@
1 + #include "common/url.h"
2 +
3 + #include <fmt/format.h>
4 +
5 + #include <cctype>
6 +
7 + using namespace ProtoRock::Http;
8 +
// Identifies which URL component a string belongs to, so that escape(),
// unescape() and shouldEscape() can apply that component's rules.
// Values are consecutive, starting at 1.
enum EncodingMode {
    encodePath = 1,            // path handled as a whole
    encodePathSegment = 2,     // a single path segment
    encodeHost = 3,            // host (including :port and [ipv6])
    encodeZone = 4,            // IPv6 zone identifier
    encodeUserPassword = 5,    // userinfo (username or password)
    encodeQueryComponent = 6,  // query component (space <-> '+')
    encodeFragment = 7,        // fragment
};
18 +
// Converts a single ASCII hexadecimal digit ('0'-'9', 'a'-'f', 'A'-'F') to
// its numeric value. Any other character yields 0.
char unhex(char c) {
    char value = 0;
    if (c >= '0' && c <= '9') {
        value = c - '0';
    } else if (c >= 'a' && c <= 'f') {
        value = c - 'a' + 10;
    } else if (c >= 'A' && c <= 'F') {
        value = c - 'A' + 10;
    }
    return value;
}
31 +
// Lookup table mapping a nibble value (0-15) to its uppercase hex digit.
const char *upperhex =
    "0123456789"
    "ABCDEF";
33 +
34 + // Return true if the specified character should be escaped when
35 + // appearing in a URL string, according to RFC 3986.
36 + //
37 + // Please be informed that for now shouldEscape does not check all
38 + // reserved characters correctly. See golang.org/issue/5684.
39 + bool shouldEscape(char c, EncodingMode mode) {
40 + // §2.3 Unreserved characters (alphanum)
41 + if ('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
42 + return false;
43 + }
44 + if (mode == encodeHost || mode == encodeZone) {
45 + // §3.2.2 Host allows
46 + // sub-delims = "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
47 + // as part of reg-name.
48 + // We add : because we include :port as part of host.
49 + // We add [ ] because we include [ipv6]:port as part of host.
50 + // We add < > because they're the only characters left that
51 + // we could possibly allow, and Parse will reject them if we
52 + // escape them (because hosts can't use %-encoding for
53 + // ASCII bytes).
54 + switch (c) {
55 + case '!':
56 + case '$':
57 + case '&':
58 + case '\'':
59 + case '(':
60 + case ')':
61 + case '*':
62 + case '+':
63 + case ',':
64 + case ';':
65 + case '=':
66 + case ':':
67 + case '[':
68 + case ']':
69 + case '<':
70 + case '>':
71 + case '"':
72 + return false;
73 + }
74 + }
75 +
76 + switch (c) {
77 + // §2.3 Unreserved characters (mark)
78 + case '-':
79 + case '_':
80 + case '.':
81 + case '~':
82 + return false;
83 +
84 + // §2.2 Reserved characters (reserved)
85 + case '$':
86 + case '&':
87 + case '+':
88 + case ',':
89 + case '/':
90 + case ':':
91 + case ';':
92 + case '=':
93 + case '?':
94 + case '@':
95 + // Different sections of the URL allow a few of
96 + // the reserved characters to appear unescaped.
97 + switch (mode) {
98 + case encodePath: // §3.3
99 + // The RFC allows : @ & = + $ but saves / ; , for assigning
100 + // meaning to individual path segments. This package
101 + // only manipulates the path as a whole, so we allow those
102 + // last three as well. That leaves only ? to escape.
103 + return c == '?';
104 +
105 + case encodePathSegment: // §3.3
106 + // The RFC allows : @ & = + $ but saves / ; , for assigning
107 + // meaning to individual path segments.
108 + return c == '/' || c == ';' || c == ',' || c == '?';
109 +
110 + case encodeUserPassword: // §3.2.1
111 + // The RFC allows ';', ':', '&', '=', '+', '$', and ',' in
112 + // userinfo, so we must escape only '@', '/', and '?'.
113 + // The parsing of userinfo treats ':' as special so we must escape
114 + // that too.
115 + return c == '@' || c == '/' || c == '?' || c == ':';
116 +
117 + case encodeQueryComponent: // §3.4
118 + // The RFC reserves (so we must escape) everything.
119 + return true;
120 +
121 + case encodeFragment: // §4.1
122 + // The RFC text is silent but the grammar allows
123 + // everything: case so escape nothing.
124 + return false;
125 + }
126 + }
127 +
128 + if (mode == encodeFragment) {
129 + // RFC 3986 §2.2 allows not escaping sub-delims. A subset of sub-delims are
130 + // included in reserved from RFC 2396 §2.2. The remaining sub-delims do not
131 + // need to be escaped. To minimize potential breakage, we apply two restrictions:
132 + // (1) we always escape sub-delims outside of the fragment, and (2) we always
133 + // escape single quote to avoid breaking callers that had previously assumed that
134 + // single quotes would be escaped. See issue #19917.
135 + switch (c) {
136 + case '!':
137 + case '(':
138 + case ')':
139 + case '*':
140 + return false;
141 + }
142 + }
143 +
144 + return true;
145 + }
146 +
147 + std::string escape(const std::string &s, EncodingMode mode) {
148 + auto spaceCount = 0;
149 + auto hexCount = 0;
150 +
151 + for (auto i = 0; i < s.size(); i++) {
152 + auto c = s[i];
153 + if (shouldEscape(c, mode)) {
154 + if (c == ' ' && mode == encodeQueryComponent) {
155 + spaceCount++;
156 + } else {
157 + hexCount++;
158 + }
159 + }
160 + }
161 +
162 + if (spaceCount == 0 && hexCount == 0) {
163 + return s;
164 + }
165 +
166 + auto required = s.size() + 2 * hexCount;
167 + auto t = std::vector<char>();
168 + t.reserve(required);
169 +
170 + if (hexCount == 0) {
171 + t.insert(t.begin(), s.begin(), s.end());
172 + for (auto i = 0; i < s.size(); i++) {
173 + if (s[i] == ' ') {
174 + t[i] = '+';
175 + }
176 + }
177 + return std::string(t.begin(), t.end());
178 + }
179 +
180 + auto j = 0;
181 + auto c = 0;
182 + for (auto i = 0; i < s.size(); i++) {
183 + auto c = s[i];
184 + if (c == ' ' && mode == encodeQueryComponent) {
185 + t[j] = '+';
186 + j++;
187 + } else if (shouldEscape(c, mode)) {
188 + t[j] = '%';
189 + t[j + 1] = upperhex[c >> 4];
190 + t[j + 2] = upperhex[c & 15];
191 + j += 3;
192 + } else {
193 + t[j] = s[i];
194 + j++;
195 + }
196 + }
197 +
198 + return std::string(t.begin(), t.end());
199 + }
200 +
201 + std::string unescape(std::string s, EncodingMode mode) {
202 + // Count %, check that they're well-formed.
203 + auto n = 0;
204 + auto hasPlus = false;
205 + auto tmp = std::string();
206 + auto v = 0;
207 + for (int i = 0; i < s.size();) {
208 + switch (s[i]) {
209 + case '%':
210 + n++;
211 + if (i + 2 >= s.size() || !std::isxdigit(s[i + 1]) || !std::isxdigit(s[i + 2])) {
212 + s = std::string(s.begin() + 1, s.end());
213 + if (s.size() > 3) {
214 + s = std::string(s.begin(), s.begin() + 3);
215 + }
216 + throw std::invalid_argument("escape error: " + s);
217 + }
218 + // Per https://tools.ietf.org/html/rfc3986#page-21
219 + // in the host component %-encoding can only be used
220 + // for non-ASCII bytes.
221 + // But https://tools.ietf.org/html/rfc6874#section-2
222 + // introduces %25 being allowed to escape a percent sign
223 + // in IPv6 scoped-address literals. Yay.
224 + tmp = std::string(s.begin() + i, s.begin() + i + 3);
225 + if (mode == encodeHost && unhex(s[i + 1]) < 8 && tmp != "%25") {
226 + throw std::invalid_argument("escape error: " + tmp);
227 + }
228 +
229 + if (mode == encodeZone) {
230 + // RFC 6874 says basically "anything goes" for zone identifiers
231 + // and that even non-ASCII can be redundantly escaped,
232 + // but it seems prudent to restrict %-escaped bytes here to those
233 + // that are valid host name bytes in their unescaped form.
234 + // That is, you can use escaping in the zone identifier but not
235 + // to introduce bytes you couldn't just write directly.
236 + // But Windows puts spaces here! Yay.
237 + v = unhex(s[i + 1]) << 4 | unhex(s[i + 2]);
238 + tmp = std::string(s.begin() + i, s.begin() + i + 3);
239 + if (tmp != "%25" && v != ' ' && shouldEscape(v, encodeHost)) {
240 + throw std::invalid_argument("escape error: " + tmp);
241 + }
242 + }
243 + i += 3;
244 + break;
245 + case '+':
246 + hasPlus = mode == encodeQueryComponent;
247 + i++;
248 + break;
249 + default:
250 + if ((mode == encodeHost || mode == encodeZone) && (uint8_t)s[i] < 0x80 && shouldEscape(s[i], mode)) {
251 + tmp = std::string(s.begin() + i, s.begin() + i + 1);
252 + throw std::invalid_argument("invalid host: " + tmp);
253 + }
254 + i++;
255 + }
256 + }
257 +
258 + if (n == 0 && !hasPlus) {
259 + return s;
260 + }
261 +
262 + auto ss = std::stringstream();
263 + for (int i = 0; i < s.size(); i++) {
264 + switch (s[i]) {
265 + case '%':
266 + ss << (char)(unhex(s[i + 1]) << 4 | unhex(s[i + 2]));
267 + i += 2;
268 + break;
269 + case '+':
270 + ss << ((mode == encodeQueryComponent) ? ' ' : '+');
271 + break;
272 + default:
273 + ss << s[i];
274 + }
275 + }
276 + return ss.str();
277 + }
278 +
// Reports whether `s` contains an ASCII control byte (0x00-0x1F or 0x7F).
//
// Bytes are compared as unsigned values so that non-ASCII bytes (>= 0x80,
// e.g. UTF-8 sequences) are not mistaken for control bytes on platforms
// where plain `char` is signed.
bool stringContainsCTLByte(const std::string &s) {
    for (const char c : s) {
        const unsigned char byte = static_cast<unsigned char>(c);
        if (byte < 0x20 || byte == 0x7f) {
            return true;
        }
    }
    return false;
}
287 +
// Splits a leading "scheme:" off `url`.
//
// A scheme is a letter followed by letters, digits, '+', '-' or '.', and is
// terminated by ':'. When one is found it is returned and `url` is reduced
// to the text after the ':'. When there is no valid scheme, an empty string
// is returned and `url` is left untouched.
//
// Throws std::invalid_argument("missing protocol scheme") when `url` starts
// with ':' (matching Go's net/url), rather than silently dropping the colon.
std::string getScheme(std::string &url) {
    for (std::size_t i = 0; i < url.size(); ++i) {
        const char c = url[i];

        const bool isLetter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
        if (isLetter) {
            continue;
        }

        const bool isSchemeExtra = (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
        if (isSchemeExtra) {
            if (i == 0) {
                // A scheme must start with a letter.
                return std::string();
            }
            continue;
        }

        if (c == ':') {
            if (i == 0) {
                throw std::invalid_argument("missing protocol scheme");
            }
            std::string scheme = url.substr(0, i);
            url.erase(0, i + 1);
            return scheme;
        }

        // Invalid scheme character: there is no scheme.
        return std::string();
    }
    return std::string();
}
311 +
// Checks that `s` only contains characters permitted in the userinfo part of
// an authority (RFC 3986 §3.2.1):
//   userinfo   = *( unreserved / pct-encoded / sub-delims / ":" )
//   unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
//   sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
//              / "*" / "+" / "," / ";" / "="
// '%' and '@' are accepted as well. Percent-escapes themselves are not
// validated here; unescape() takes care of that.
bool validUserinfo(const std::string &s) {
    const auto isAllowed = [](char ch) -> bool {
        const bool alnum = (ch >= 'A' && ch <= 'Z') || (ch >= 'a' && ch <= 'z') ||
                           (ch >= '0' && ch <= '9');
        if (alnum) {
            return true;
        }
        switch (ch) {
            case '-': case '.': case '_': case ':': case '~': case '!':
            case '$': case '&': case '\'': case '(': case ')': case '*':
            case '+': case ',': case ';': case '=': case '%': case '@':
                return true;
            default:
                return false;
        }
    };

    for (std::string::size_type pos = 0; pos < s.size(); ++pos) {
        if (!isAllowed(s[pos])) {
            return false;
        }
    }
    return true;
}
357 +
// Returns true when `port` is empty or has the form ":<digits>" (the digit
// run may be empty), i.e. it matches /^:\d*$/.
bool validOptionalPort(const std::string &port) {
    if (port.empty()) {
        return true;
    }
    if (port[0] != ':') {
        return false;
    }
    for (std::string::size_type pos = 1; pos < port.size(); ++pos) {
        const char digit = port[pos];
        const bool isDigit = (digit >= '0') && (digit <= '9');
        if (!isDigit) {
            return false;
        }
    }
    return true;
}
374 +
375 + // parseHost parses host as an authority without user
376 + // information. That is, as host[:port].
377 + std::string parseHost(const std::string &host) {
378 + int idx;
379 + if (!host.empty() && host[0] == '[') {
380 + // Parse an IP-Literal in RFC 3986 and RFC 6874.
381 + // E.g., "[fe80::1]", "[fe80::1%25en0]", "[fe80::1]:80".
382 + idx = host.find_last_of(']');
383 + if (idx >= host.size()) {
384 + throw std::invalid_argument("cannot find ']' in host");
385 + }
386 + auto colonPort = std::string(host.begin() + idx + 1, host.end());
387 + if (!validOptionalPort(colonPort)) {
388 + throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
389 + }
390 + // RFC 6874 defines that %25 (%-encoded percent) introduces
391 + // the zone identifier, and the zone identifier can use basically
392 + // any %-encoding it likes. That's different from the host, which
393 + // can only %-encode non-ASCII bytes.
394 + // We do impose some restrictions on the zone, to avoid stupidity
395 + // like newlines.
396 + auto zone = host.find("%25");
397 + if (idx != std::string::npos) {
398 + auto host1 = unescape(std::string(host.begin(), host.begin() + zone), encodeHost);
399 + auto host2 = unescape(std::string(host.begin() + zone, host.begin() + idx), encodeHost);
400 + auto host3 = unescape(std::string(host.begin() + idx, host.end()), encodeZone);
401 + return host1 + host2 + host3;
402 + }
403 + } else if ((idx = host.find_last_of(':')) < host.size()) {
404 + auto colonPort = std::string(host.begin() + idx, host.end());
405 + if (!validOptionalPort(colonPort)) {
406 + throw std::invalid_argument(fmt::format("invalid port {} after host", colonPort));
407 + }
408 + }
409 +
410 + return unescape(host, encodeHost);
411 + }
412 +
413 + void parseAuthority(const std::string &authority, UserInfo &ui, std::string &host) {
414 + auto i = authority.find_last_of('@');
415 + if (i > authority.size()) {
416 + host = parseHost(authority);
417 + } else {
418 + host = parseHost(std::string(authority.begin() + i + 1, authority.end()));
419 + }
420 +
421 + if (i > authority.size()) {
422 + return;
423 + }
424 +
425 + auto userInfo = std::string(authority.begin(), authority.begin() + i);
426 + if (!validUserinfo(userInfo)) {
427 + throw std::invalid_argument("invalid userinfo");
428 + }
429 + auto idx = userInfo.find(':');
430 + if (idx == std::string::npos) {
431 + userInfo = unescape(userInfo, encodeUserPassword);
432 + ui = UserInfo(userInfo);
433 + } else {
434 + auto username = std::string(userInfo.begin(), userInfo.begin() + idx);
435 + auto password = std::string(userInfo.begin() + idx, userInfo.end());
436 + ui.Username = unescape(username, encodeUserPassword);
437 + ui.Password = unescape(username, encodeUserPassword);
438 + }
439 + }
440 +
441 + URL URL::Parse(std::string url) {
442 + URL u;
443 + std::string frag;
444 + auto hashIndex = url.find("#");
445 + if (hashIndex != std::string::npos) {
446 + frag = std::string(url.begin() + hashIndex, url.end());
447 + url = std::string(url.begin(), url.begin() + hashIndex);
448 + }
449 +
450 + u.setFragment(frag);
451 +
452 + if (stringContainsCTLByte(url)) {
453 + throw std::invalid_argument("invalid url: string contains control bytes");
454 + }
455 +
456 + if (url == "*") {
457 + u.Path = "*";
458 + return u;
459 + }
460 + auto rest = url;
461 +
462 + u.Scheme = getScheme(rest);
463 + std::transform(u.Scheme.begin(), u.Scheme.end(), u.Scheme.begin(), [](unsigned char c) -> unsigned char { return std::tolower(c); });
464 +
465 + if (!rest.empty() > 0 && rest[rest.size() - 1] == '?') {
466 + u.ForceQuery = true;
467 + rest.pop_back();
468 + } else {
469 + auto idx = rest.find("?");
470 + if (idx != std::string::npos) {
471 + u.RawQuery = std::string(rest.begin() + idx, rest.end());
472 + rest = std::string(rest.begin(), rest.begin() + idx);
473 + }
474 + }
475 +
476 + if (!rest.empty() && rest[0] != '/') {
477 + if (!u.Scheme.empty()) {
478 + // We consider rootless paths per RFC 3986 as opaque.
479 + u.Opaque = rest;
480 + return u;
481 + }
482 + }
483 +
484 + if (!u.Scheme.empty() || (rest.find("///") != 0 && rest.find("//") == 0)) {
485 + auto authority = std::string(rest.begin() + 2, rest.end());
486 + rest = "";
487 + int i = authority.find("/");
488 + if (i != std::string::npos) {
489 + rest = std::string(authority.begin() + i, authority.end());
490 + authority = std::string(authority.begin(), authority.begin() + i);
491 + }
492 + parseAuthority(authority, u.User, u.Host);
493 + }
494 +
495 + u.setPath(rest);
496 +
497 + return u;
498 + }
499 +
500 + void URL::setFragment(const std::string &f) {
501 + Fragment = unescape(f, encodeFragment);
502 + auto escf = escape(Fragment, encodeFragment);
503 + RawFragment = (escf == f) ? "" : f;
504 + }
505 +
506 + void URL::setPath(const std::string &p) {
507 + Path = unescape(p, encodePath);
508 + auto escp = escape(Path, encodePath);
509 + RawPath = (escp == p) ? "" : p;
510 + }
511 +
512 + std::string URL::PathEscape(const std::string &path) { return escape(path, encodePath); }
513 + std::string URL::PathUnescape(const std::string &path) { return unescape(path, encodePath); }
514 + std::string URL::QueryEscape(const std::string &query) { return escape(query, encodeQueryComponent); }
515 + std::string URL::QueryUnescape(const std::string &query) { return unescape(query, encodeQueryComponent); }

url.h(arquivo criado)

@@ -0,0 +1,56 @@
1 + #pragma once
2 +
3 + #include <algorithm>
4 + #include <map>
5 + #include <stdexcept>
6 +
7 + #include "common/common.h"
8 +
9 + // Based on Golang Implementation
10 + // MIT
11 +
namespace ProtoRock {
namespace Http {

// Username and password taken from the userinfo part of a URL authority.
struct UserInfo {
    std::string Username;
    std::string Password;

    // Creates empty credentials.
    UserInfo() : Username(), Password() {}
    // Creates credentials with only a username; the password stays empty.
    UserInfo(const std::string &u) : Username(u), Password() {}
};

// Parsed representation of a URL, modelled after Go's net/url.URL.
struct URL {
    // Scheme, lower-cased by Parse.
    std::string Scheme;
    // Encoded opaque data (set for URLs with a scheme and a rootless remainder).
    std::string Opaque;
    // Username and password information.
    UserInfo User;
    // Host or host:port.
    std::string Host;
    // Unescaped path (relative paths may omit the leading slash).
    std::string Path;
    // Encoded path hint; left empty when the parsed path already equals the
    // default encoding of Path.
    std::string RawPath;
    // Append a query ('?') even if RawQuery is empty.
    bool ForceQuery = false;
    // Encoded query values, without '?'.
    std::string RawQuery;
    // Unescaped fragment for references, without '#'.
    std::string Fragment;
    // Encoded fragment hint; left empty when the parsed fragment already
    // equals the default encoding of Fragment.
    std::string RawFragment;

    // Parses `url`; throws std::invalid_argument on malformed input.
    static URL Parse(std::string url);
    // Escapes / unescapes a string using the path rules.
    static std::string PathEscape(const std::string &path);
    static std::string PathUnescape(const std::string &path);
    // Escapes / unescapes a string using the query-component rules.
    static std::string QueryEscape(const std::string &query);
    static std::string QueryUnescape(const std::string &query);

   private:
    // Set Fragment/RawFragment and Path/RawPath from an encoded string.
    void setFragment(const std::string &);
    void setPath(const std::string &);
};

}  // namespace Http
}  // namespace ProtoRock
Próximo Anterior