-
-
Notifications
You must be signed in to change notification settings - Fork 197
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement new approach for URI normalization (fixes #287)
- Loading branch information
1 parent
1162ff7
commit 7d91ca0
Showing
2 changed files
with
53 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -48,6 +48,53 @@ public static function unescapeAndEncode($uri) | |
{ | ||
$decoded = html_entity_decode($uri); | ||
|
||
return strtr(rawurlencode(rawurldecode($decoded)), self::$dontEncode); | ||
return self::encode(self::decode($decoded)); | ||
} | ||
|
||
/**
 * Reverse percent-encoding in a URI.
 *
 * Every `%XX` escape (hex digits in either case) is turned back into the
 * single byte it stands for. Escapes that appear as keys of
 * self::$dontEncode are the exception: they stay escaped and are only
 * normalised to uppercase.
 *
 * @param string $uri
 *
 * @return string
 */
private static function decode($uri)
{
    $restoreEscape = function ($hit) {
        // $hit[0] is the whole escape ("%2f"), $hit[1] just its two hex digits ("2f")
        $escape = strtoupper($hit[0]);

        // Protected escapes are returned in uppercase form; anything else
        // becomes the byte whose value the hex digits spell out
        return array_key_exists($escape, self::$dontEncode)
            ? $escape
            : chr(hexdec($hit[1]));
    };

    return preg_replace_callback('/%([0-9a-f]{2})/iu', $restoreEscape, $uri);
}
|
||
/**
 * Encode a URI, preserving already-encoded and excluded characters
 *
 * The input is consumed one token at a time, where a token is either an
 * existing percent-escape (`%` plus two hex digits) or a single character.
 * Existing escapes and characters found among the values of
 * self::$dontEncode are copied through untouched; everything else is passed
 * to rawurlencode().
 *
 * Note: preg_replace_callback() returns null when matching fails, which under
 * the `u` modifier includes input that is not valid UTF-8.
 *
 * @param string $uri
 *
 * @return string
 */
private static function encode($uri)
{
    // The `s` modifier makes `.` match newlines too. Without it a line break
    // matches neither alternative, never reaches the callback, and is copied
    // to the output unencoded.
    return preg_replace_callback('/(%[0-9a-f]{2})|./isu', function ($matches) {
        // Capture group 1 is only present when the token is an existing
        // percent-escape, so keep already-encoded characters as-is
        if (count($matches) > 1) {
            return $matches[0];
        }

        // Keep excluded characters as-is (strict comparison avoids type juggling)
        if (in_array($matches[0], self::$dontEncode, true)) {
            return $matches[0];
        }

        // Otherwise, encode the character
        return rawurlencode($matches[0]);
    }, $uri);
}
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
I'm wondering: if you remove `/i` and just add `A-F` to the pattern, which micro-optimization is "better"? Or, in other words, how long does the input string have to be for this to even start to matter?