
Chapter 3 question #55

Open

Description

@liaoran

Hi,

In Chapter 3, in the "collect a list of all external links" example, the code from the book raises an error, apparently around the splitAddress function:

即将获取链接的URL是:/ ("the URL about to be fetched is: /")

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-2-4254ccc1a2c3> in <module>()
     64             getAllExternalLinks(link)
     65 
---> 66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     62             print("即将获取链接的URL是:" + link)
     63             allIntLinks.add(link)
---> 64             getAllExternalLinks(link)
     65 
     66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     62             print("即将获取链接的URL是:" + link)
     63             allIntLinks.add(link)
---> 64             getAllExternalLinks(link)
     65 
     66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     62             print("即将获取链接的URL是:" + link)
     63             allIntLinks.add(link)
---> 64             getAllExternalLinks(link)
     65 
     66 getAllExternalLinks("http://oreilly.com")

<ipython-input-2-4254ccc1a2c3> in getAllExternalLinks(siteUrl)
     49 allIntLinks = set()
     50 def getAllExternalLinks(siteUrl):
---> 51     html = urlopen(siteUrl)
     52     bs = BeautifulSoup(html,"html.parser")
     53     internalLinks = getInternalLinks(bs,splitAddress(siteUrl)[0])

~/anaconda3/lib/python3.6/urllib/request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    221     else:
    222         opener = _opener
--> 223     return opener.open(url, data, timeout)
    224 
    225 def install_opener(opener):

~/anaconda3/lib/python3.6/urllib/request.py in open(self, fullurl, data, timeout)
    509         # accept a URL or a Request object
    510         if isinstance(fullurl, str):
--> 511             req = Request(fullurl, data)
    512         else:
    513             req = fullurl

~/anaconda3/lib/python3.6/urllib/request.py in __init__(self, url, data, headers, origin_req_host, unverifiable, method)
    327                  origin_req_host=None, unverifiable=False,
    328                  method=None):
--> 329         self.full_url = url
    330         self.headers = {}
    331         self.unredirected_hdrs = {}

~/anaconda3/lib/python3.6/urllib/request.py in full_url(self, url)
    353         self._full_url = unwrap(url)
    354         self._full_url, self.fragment = splittag(self._full_url)
--> 355         self._parse()
    356 
    357     @full_url.deleter

~/anaconda3/lib/python3.6/urllib/request.py in _parse(self)
    382         self.type, rest = splittype(self._full_url)
    383         if self.type is None:
--> 384             raise ValueError("unknown url type: %r" % self.full_url)
    385         self.host, self.selector = splithost(rest)
    386         if self.host:

ValueError: unknown url type: '/'
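My guess is that the ValueError comes from a relative href such as '/' being passed straight to urlopen. As a rough sketch of what I mean (my own helper name get_absolute_links, not the book's code), resolving each href against the page it was found on with urllib.parse.urljoin avoids handing urlopen a bare path with no scheme:

```python
from urllib.parse import urljoin
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_absolute_links(page_url):
    """Fetch page_url and return its <a href> targets as absolute URLs."""
    bs = BeautifulSoup(urlopen(page_url), 'html.parser')
    links = set()
    for a in bs.find_all('a', href=True):
        # urljoin resolves relative hrefs like '/' or 'about.html'
        # against the page they came from, so every link we recurse
        # into is a full http(s) URL.
        links.add(urljoin(page_url, a['href']))
    return links
```

With something like that, the recursive getAllExternalLinks(link) call would always receive an absolute URL.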

The code you provide on GitHub also fails, but with a different error:

Traceback (most recent call last):
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 81, in <module>
    getAllExternalLinks("http://oreilly.com")
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
    getAllExternalLinks(link)
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
    getAllExternalLinks(link)
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 76, in getAllExternalLinks
    getAllExternalLinks(link)
  [Previous line repeated 15 more times]
  File "/home/kongnian/PycharmProjects/Scraping/getAllExternalLinks.py", line 63, in getAllExternalLinks
    html = urlopen(siteUrl)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 223, in urlopen
    return opener.open(url, data, timeout)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 564, in error
    result = self._call_chain(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 756, in http_error_302
    return self.parent.open(new, timeout=req.timeout)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 532, in open
    response = meth(req, response)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 642, in http_response
    'http', request, response, code, msg, hdrs)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 570, in error
    return self._call_chain(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 504, in _call_chain
    result = func(*args)
  File "/home/kongnian/anaconda3/lib/python3.6/urllib/request.py", line 650, in http_error_default
    raise HTTPError(req.full_url, code, msg, hdrs, fp)
urllib.error.HTTPError: HTTP Error 404: Not Found

Does this 404 simply mean the collection has finished?
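Or is it just one dead internal link, with the unhandled HTTPError stopping the whole crawl? A small sketch of what I have in mind (a hypothetical fetch_page helper, not from the book) that skips broken links instead of aborting:

```python
from urllib.request import urlopen
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup

def fetch_page(url):
    """Return the parsed page, or None if the link is dead or unreachable."""
    try:
        return BeautifulSoup(urlopen(url), 'html.parser')
    except (HTTPError, URLError) as err:
        # A single 404 (or DNS failure) should not end the whole collection.
        print('Skipping {}: {}'.format(url, err))
        return None
```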

Thanks!
