# Generates urlencoded.txt from utf-8.txt
#
# urlencoded.txt is used by Tests_Formatting_Utf8UriEncode
import codecs
import re
import sys
import urllib

try:
    from urllib.parse import quote  # Python 3
except ImportError:
    from urllib import quote  # Python 2
# Uncapitalize pct-encoded values, leave the rest alone.
# Matches one percent-encoded octet written with uppercase hex digits and
# captures the two digits for fix(). The class is restricted to real hex
# digits (A-F, not A-Z) so fix() can never be handed something that
# int(octet, 16) rejects.
capfix = re.compile(r"%([0-9A-F]{2})")
def fix(match):
    """Return the replacement text for one percent-encoded octet.

    ``match`` is a regex match whose group 1 holds the two hex digits of
    the octet (the part after ``%``).

    * Octets below 128 (ASCII) are decoded back to the literal character,
      which is then lowercased.
    * Octets of 128 and above stay percent-encoded, with the hex digits
      lowercased (``%C3`` becomes ``%c3``).
    """
    octet = match.group(1)
    intval = int(octet, 16)
    if intval < 128:
        # ASCII: undo the percent-encoding entirely.
        return chr(intval).lower()
    # Non-ASCII byte: keep it encoded, only lowercase the hex digits.
    return '%' + octet.lower()
def urlencode(line):
    """Percent-encode each byte of non-ASCII unicode characters.

    The line is stripped of surrounding whitespace, encoded to UTF-8 and
    percent-quoted. The quoted text is then passed through ``capfix`` and
    ``fix``, which turn ASCII escapes back into literal characters and
    lowercase the hex digits of the remaining (non-ASCII) escapes.

    Returns the resulting string.
    """
    # quote() is imported at module level from urllib.parse (Python 3) or
    # urllib (Python 2). The two differ only in which ASCII characters they
    # escape, and fix() decodes every ASCII escape again.
    quoted = quote(line.strip().encode("utf-8"))
    return capfix.sub(fix, quoted)
if __name__ == "__main__":
    # Script entry point: read UTF-8 lines from stdin, write the encoded
    # lines to stdout as ASCII, joined by newlines (no trailing newline).
    args = sys.argv[1:]
    if args and args[0] in ("-h", "--help"):
        # Parenthesised single-argument form prints the same text on
        # Python 2 and Python 3.
        print("Usage: python urlencode.py < utf-8.txt > urlencoded.txt")
        sys.exit(2)
    # The codecs wrappers need byte streams. On Python 3 those live on
    # .buffer; on Python 2 the std streams are already byte streams.
    raw_stdin = getattr(sys.stdin, "buffer", sys.stdin)
    raw_stdout = getattr(sys.stdout, "buffer", sys.stdout)
    sys.stdin = codecs.getreader("utf-8")(raw_stdin)
    sys.stdout = codecs.getwriter("ascii")(raw_stdout)

    lines = sys.stdin.readlines()
    sys.stdout.write("\n".join(map(urlencode, lines)))
|