Forums

Error while trying to use BeautifulSoup

This is the error I get:

Traceback (most recent call last):
  File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 594, in urlopen
    self._prepare_proxy(conn)
  File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 805, in _prepare_proxy
    conn.connect()
  File "/usr/lib/python3.8/site-packages/urllib3/connection.py", line 308, in connect
    self._tunnel()
  File "/usr/lib/python3.8/http/client.py", line 898, in _tunnel
    raise OSError("Tunnel connection failed: %d %s" % (code,
OSError: Tunnel connection failed: 403 Forbidden

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
    resp = conn.urlopen(
  File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 637, in urlopen
    retries = retries.increment(method, url, error=e, _pool=self,
  File "/usr/lib/python3.8/site-packages/urllib3/util/retry.py", line 399, in increment
    raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='accademiadellacrusca.it', port=443): Max retries exceeded with url: /it/lingua-italiana/parole-nuove/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "main.py", line 120, in <module>
    response = requests.get(CRUSCA_NUOVE)
  File "/usr/lib/python3.8/site-packages/requests/api.py", line 75, in get
    return request('get', url, params=params, **kwargs)
  File "/usr/lib/python3.8/site-packages/requests/api.py", line 60, in request
    return session.request(method=method, url=url, **kwargs)
  File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 533, in request
    resp = self.send(prep, **send_kwargs)
  File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 646, in send
    r = adapter.send(request, **kwargs)
  File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 510, in send
    raise ProxyError(e, request=request)
requests.exceptions.ProxyError: HTTPSConnectionPool(host='accademiadellacrusca.it', port=443): Max retries exceeded with url: /it/lingua-italiana/parole-nuove/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))

This is my code:

from bs4 import BeautifulSoup
import re
import random
import requests
from time import sleep

CRUSCA_NUOVE = "https://accademiadellacrusca.it/it/lingua-italiana/parole-nuove/"

# Download the index page and parse its HTML.
response = requests.get(CRUSCA_NUOVE)
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
# Every <a> element nested inside an <h2> heading.
web_page_tag = soup.select(selector="h2 a")

# Pause before any further request (presumably to go easy on the server).
sleep(5)

# Build the word list: drop links that carry a data-toggle attribute and
# links whose markup contains "/it/contenuti/".
parole_list = [
    value
    for value in web_page_tag
    if not (value.has_attr('data-toggle') or "/it/contenuti/" in str(value))
]

# Module-level slots overwritten by estrai_definizione() on every call:
# usage context, definition and the word itself. All start out empty.
ambito_duso = definizione = parola = ""

def _estrai_campo(pattern, testo, predefinito):
    """Return group 1 of *pattern* in *testo* as clean text, or *predefinito*.

    HTML tags are stripped from the captured text and runs of whitespace are
    collapsed to single spaces. *predefinito* is returned unchanged when the
    pattern does not match.
    """
    trovato = re.search(pattern, testo)
    if trovato is None:
        return predefinito
    senza_tag = re.sub(r'<[^>]*>', '', trovato.group(1))
    return " ".join(senza_tag.split())


def estrai_definizione(n):
    """Fetch the page of the n-th word and store its details in module globals.

    Reads entry *n* of the module-level ``parole_list``, downloads the page its
    ``href`` points to, and sets the globals ``parola`` (link text),
    ``definizione`` and ``ambito_duso``. When a section cannot be found in the
    page, the corresponding global receives a fixed "not found" message.

    Returns None. Raises IndexError if *n* is out of range for ``parole_list``;
    network errors from ``requests.get`` propagate to the caller.
    """
    # parole_list is only read, so it needs no ``global`` declaration.
    global ambito_duso
    global definizione
    global parola

    voce = parole_list[n]
    parola = voce.text
    link = voce.get("href")

    response = requests.get(f"https://accademiadellacrusca.it{link}")
    soup = BeautifulSoup(response.text, "html.parser")

    # Grab the whole div that holds both "Ambito d'uso" and "Definizione",
    # flattened to one line so that "." in the patterns can span the markup.
    div_completo = soup.find(name="div", class_="col-md-9")
    div_completo_string = " ".join(str(div_completo).split())

    # Raw strings keep the regex escapes (\. and \/) from being treated as
    # invalid Python string escapes.
    definizione = _estrai_campo(
        r"(Definizione.+?\.).+<\/p>.+",
        div_completo_string,
        "Definizione: nessuna definizione trovata",
    )
    ambito_duso = _estrai_campo(
        r".+(Ambito d'uso.+?)<\/p>.+",
        div_completo_string,
        "Ambito d'uso: nessun ambito d'uso trovato.",
    )


# Heading of the "new words" section. ``contents`` is not defined in this
# excerpt; it is expected to exist already when this code runs.
contents += "\n***\nParole nuove dalla Accademia della Crusca!"

# Append three randomly picked words (the same word may be picked twice),
# pausing five seconds after each page fetch.
for _ in range(3):
    i = random.randint(0, len(parole_list) - 1)
    estrai_definizione(i)
    contents += f"\n{parola}\n{ambito_duso}\n{definizione}\n"
    sleep(5)

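Free accounts have internet access limited to whitelisted sites, so the proxy rejects requests to any other site with the "403 Forbidden" error shown in the traceback above.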

Can you add https://www.reachsci.org/mini-phd-stem to the allowed websites list?

See https://help.pythonanywhere.com/pages/RequestingAllowlistAdditions/