This is the error I get:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 594, in urlopen
self._prepare_proxy(conn)
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 805, in _prepare_proxy
conn.connect()
File "/usr/lib/python3.8/site-packages/urllib3/connection.py", line 308, in connect
self._tunnel()
File "/usr/lib/python3.8/http/client.py", line 898, in _tunnel
raise OSError("Tunnel connection failed: %d %s" % (code,
OSError: Tunnel connection failed: 403 Forbidden
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 439, in send
resp = conn.urlopen(
File "/usr/lib/python3.8/site-packages/urllib3/connectionpool.py", line 637, in urlopen
retries = retries.increment(method, url, error=e, _pool=self,
File "/usr/lib/python3.8/site-packages/urllib3/util/retry.py", line 399, in increment
raise MaxRetryError(_pool, url, error or ResponseError(cause))
urllib3.exceptions.MaxRetryError: HTTPSConnectionPool(host='accademiadellacrusca.it', port=443): Max retries exceeded with url: /it/lingua-italiana/parole-nuove/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 120, in <module>
response = requests.get(CRUSCA_NUOVE)
File "/usr/lib/python3.8/site-packages/requests/api.py", line 75, in get
return request('get', url, params=params, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/api.py", line 60, in request
return session.request(method=method, url=url, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 533, in request
resp = self.send(prep, **send_kwargs)
File "/usr/lib/python3.8/site-packages/requests/sessions.py", line 646, in send
r = adapter.send(request, **kwargs)
File "/usr/lib/python3.8/site-packages/requests/adapters.py", line 510, in send
raise ProxyError(e, request=request)
requests.exceptions.ProxyError: HTTPSConnectionPool(host='accademiadellacrusca.it', port=443): Max retries exceeded with url: /it/lingua-italiana/parole-nuove/ (Caused by ProxyError('Cannot connect to proxy.', OSError('Tunnel connection failed: 403 Forbidden')))
This is my code:
from bs4 import BeautifulSoup
import re
import random
import requests
from time import sleep
# Listing page of the Accademia della Crusca "parole nuove" section.
CRUSCA_NUOVE = "https://accademiadellacrusca.it/it/lingua-italiana/parole-nuove/"

# requests applies no timeout by default, so without one a stalled proxy or
# server would block the script forever.
response = requests.get(CRUSCA_NUOVE, timeout=30)
web_page = response.text
soup = BeautifulSoup(web_page, "html.parser")
web_page_tag = soup.select(selector="h2 a")
# NOTE(review): presumably a politeness pause before the follow-up requests.
sleep(5)

# Build the list of candidate anchors: drop those carrying a data-toggle
# attribute and those whose markup contains "/it/contenuti/".
parole_list = [
    value
    for value in web_page_tag
    if not value.has_attr("data-toggle") and "/it/contenuti/" not in str(value)
]

# Module-level slots that estrai_definizione() overwrites on every call.
ambito_duso = ""
definizione = ""
parola = ""
# Patterns are compiled once at import time. Raw strings keep the regex text
# unchanged while avoiding the invalid "\." and "\/" string escapes that a
# non-raw literal triggers.
_DEFINIZIONE_RE = re.compile(r"(Definizione.+?\.).+<\/p>.+")
_AMBITO_DUSO_RE = re.compile(r".+(Ambito d'uso.+?)<\/p>.+")
_HTML_TAG_RE = re.compile(r"<[^>]*>")

# Seconds to wait for the word page; requests has no default timeout.
_HTTP_TIMEOUT = 30


def _estrai_campo(pattern, html, default):
    """Return group 1 of ``pattern`` in ``html`` without tags, or ``default``."""
    match = pattern.search(html)
    if match is None:
        return default
    senza_tag = _HTML_TAG_RE.sub("", match.group(1))
    # Collapse every run of whitespace into a single space.
    return " ".join(senza_tag.split())


def estrai_definizione(n):
    """Fetch the page of the n-th word and extract its usage scope and definition.

    The results are stored in the module-level globals ``parola``,
    ``ambito_duso`` and ``definizione`` (which callers read after the call)
    and are also returned for convenience.

    Args:
        n: Index into the module-level ``parole_list``.

    Returns:
        A ``(parola, ambito_duso, definizione)`` tuple of strings. When a
        field cannot be located in the page, a fixed fallback message is
        used for it instead.

    Raises:
        IndexError: If ``n`` is outside ``parole_list``.
        requests.exceptions.RequestException: If the HTTP request fails
            or times out.
    """
    global ambito_duso
    global definizione
    global parola

    voce = parole_list[n]
    parola = voce.text
    link = voce.get("href")

    response = requests.get(
        f"https://accademiadellacrusca.it{link}", timeout=_HTTP_TIMEOUT
    )
    soup = BeautifulSoup(response.text, "html.parser")

    # Take the whole div holding "Ambito d'uso" and "Definizione", flattened
    # onto one line so that "." in the patterns can span the original
    # line breaks.
    div_completo = soup.find(name="div", class_="col-md-9")
    div_completo_string = " ".join(str(div_completo).split())

    definizione = _estrai_campo(
        _DEFINIZIONE_RE,
        div_completo_string,
        "Definizione: nessuna definizione trovata",
    )
    ambito_duso = _estrai_campo(
        _AMBITO_DUSO_RE,
        div_completo_string,
        "Ambito d'uso: nessun ambito d'uso trovato.",
    )
    return parola, ambito_duso, definizione
# NOTE(review): `contents` is not defined in the visible part of the file;
# it is assumed to be a str created earlier -- confirm.
contents += "\n***\nParole nuove dalla Accademia della Crusca!"

# random.sample draws distinct indices, so the same word cannot be listed
# twice (randint could repeat), and capping k at the list length avoids the
# ValueError that randint(0, -1) raises when parole_list is empty.
for i in random.sample(range(len(parole_list)), k=min(3, len(parole_list))):
    # estrai_definizione() publishes its results through the module-level
    # globals parola, ambito_duso and definizione.
    estrai_definizione(i)
    contents += f"\n{parola}\n{ambito_duso}\n{definizione}\n"
    # Pause between consecutive requests to the site.
    sleep(5)