how do I pass the √ untouched

Question:

Is it possible to pass the √ through this untouched, or am I asking too much?

import urllib.request  # urlopen() below fetches each URL listed in the links file
path = 'html'  # output file name; opened in append-binary mode for every response
links = 'links'  # input file name; the name is rebound to the open file object on the next line
with open(links, 'r', encoding='UTF-8') as links:
    for link in links:  # one iteration per line of the file (the line keeps its trailing newline, if any)
        print(link)
        with urllib.request.urlopen(link) as linker:  # fetch the page; a non-ASCII character in the URL (e.g. '√') raises UnicodeEncodeError here
            print(linker)
            with open(path, 'ab') as f:  # 'ab': append the raw response bytes to the output file
                f.write(linker.read())

links

https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A

output

File "PYdown.py", line 7, in <module>
    with urllib.request.urlopen(link) as linker:
  File "/usr/lib64/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/usr/lib64/python3.6/urllib/request.py", line 526, in open
    response = self._open(req, data)
  File "/usr/lib64/python3.6/urllib/request.py", line 544, in _open
    '_open', req)
  File "/usr/lib64/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/usr/lib64/python3.6/urllib/request.py", line 1392, in https_open
    context=self._context, check_hostname=self._check_hostname)
  File "/usr/lib64/python3.6/urllib/request.py", line 1349, in do_open
    encode_chunked=req.has_header('Transfer-encoding'))
  File "/usr/lib64/python3.6/http/client.py", line 1254, in request
    self._send_request(method, url, body, headers, encode_chunked)
  File "/usr/lib64/python3.6/http/client.py", line 1265, in _send_request
    self.putrequest(method, url, **skips)
  File "/usr/lib64/python3.6/http/client.py", line 1132, in putrequest
    self._output(request.encode('ascii'))
UnicodeEncodeError: 'ascii' codec can't encode character '\u221a' in position 29: ordinal not in range(128)
Asked By: Nimball Dimpall

||

Answers:

Instead of getting Python to read the √ as itself, I have to translate the √ to %E2%88%9A in order to get Python to accept the URL.

credit
@Olvin Roght

Answered By: Nimball Dimpall

You need to quote Unicode characters in the URL. You have a file which contains a list of URLs you need to open, so you need to split each URL (using urllib.parse.urlsplit()), IDNA-encode the host, quote (with urllib.parse.quote()) every part of the path (to split paths you can use pathlib.PurePosixPath.parts), and then form the URL back (using urllib.parse.urlunsplit()).

from pathlib import PurePosixPath
from urllib.parse import urlsplit, urlunsplit, quote, urlencode, parse_qsl


def normalize_url(url):
    """Return *url* rewritten so that its host, path and query are ASCII-only.

    The host is IDNA-encoded, every path segment and the query string are
    percent-encoded as UTF-8, and the fragment is passed through unchanged.

    The path is split on "/" directly rather than through ``PurePosixPath``,
    so its structure is preserved: an empty path stays empty, a trailing
    slash is kept, and the first segment of a relative path is quoted too.
    Segments that are already percent-encoded are not encoded a second time.

    Args:
        url: Absolute or relative URL. Leading/trailing whitespace (such as
            the newline left on a line read from a file) is ignored.

    Returns:
        The normalized URL as a ``str``.

    Raises:
        UnicodeError: If the host part cannot be encoded with the ``idna``
            codec (for example, an empty or over-long label).
    """
    # Function-scope import keeps this block self-contained; the module-level
    # import line does not provide this name.
    from urllib.parse import unquote_to_bytes

    splitted = urlsplit(url.strip())  # split link
    # Decode any existing %XX escapes to raw bytes first, then quote the
    # bytes: this makes the function idempotent and leaves non-UTF-8 escapes
    # intact. safe="" so an escaped "%2F" inside a segment stays escaped.
    quoted_path = "/".join(
        quote(unquote_to_bytes(segment), safe="")
        for segment in splitted.path.split("/")
    )
    return urlunsplit((
        splitted.scheme,
        splitted.netloc.encode("idna").decode(),  # idna
        quoted_path,
        # keep_blank_values: do not silently drop parameters such as "a=".
        urlencode(parse_qsl(splitted.query, keep_blank_values=True)),
        splitted.fragment,
    ))

Usage:

# Sample inputs: a non-ASCII path segment, a plain ASCII URL, a non-ASCII
# query value, and a non-ASCII host name.
links = (
    "https://myanimelist.net/anime/27899/Tokyo_Ghoul_√A",
    "https://stackoverflow.com/",
    "https://www.google.com/search?q=√2&client=firefox-b-d",
    "http://pfarmerü.com/"
)

# Print one normalized URL per line (the separator must be a newline escape,
# not the letter "n").
print(*(normalize_url(link) for link in links), sep="\n")

Output:

https://myanimelist.net/anime/27899/Tokyo_Ghoul_%E2%88%9AA
https://stackoverflow.com/
https://www.google.com/search?q=%E2%88%9A2&client=firefox-b-d
http://xn--pfarmer-t2a.com/
Answered By: Olvin Roght
Categories: questions Tags: ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.