# urlencode.py
#
# Generates urlencoded.txt from utf-8.txt
#
# urlencoded.txt is used by Tests_Formatting_Utf8UriEncode
import codecs
import re
import sys
import urllib

try:
    from urllib import quote  # Python 2
except ImportError:
    from urllib.parse import quote  # Python 3
  6. # uncapitalize pct-encoded values, leave the rest alone
  7. capfix = re.compile("%([0-9A-Z]{2})");
  8. def fix(match):
  9. octet = match.group(1)
  10. intval = int(octet, 16)
  11. if intval < 128:
  12. return chr(intval).lower()
  13. return '%' + octet.lower()
  14. def urlencode(line):
  15. """Percent-encode each byte of non-ASCII unicode characters."""
  16. line = urllib.quote(line.strip().encode("utf-8"))
  17. line = capfix.sub(fix, line)
  18. return line
  19. if __name__ == "__main__":
  20. args = sys.argv[1:]
  21. if args and args[0] in ("-h", "--help"):
  22. print "Usage: python urlencode.py < utf-8.txt > urlencoded.txt"
  23. sys.exit(2)
  24. sys.stdin = codecs.getreader("utf-8")(sys.stdin)
  25. sys.stdout = codecs.getwriter("ascii")(sys.stdout)
  26. lines = sys.stdin.readlines()
  27. sys.stdout.write( "\n".join(map(urlencode, lines)) )