Bonus
import os
import requests
from bs4 import BeautifulSoup

def urls_articles(url_page):
    """Return the set of article paths linked from a news listing page."""
    response = requests.get(url_page)
    soup = BeautifulSoup(response.content, "html.parser")
    print(f"Scraping: {soup.title.text}")
    return {
        link["href"]
        for link in soup.find_all("a", class_="c-link")
        if link.get("href", "").startswith("/actualite-economique/actualites/")
    }

def image_parser(url_article):
    """Return the lazy-loaded image URLs found in an article page."""
    content = requests.get(url_article).content
    soup = BeautifulSoup(content, "html.parser")
    return [
        div["data-original-src"]
        for div in soup.find_all("div", class_="c-thumb-lazy")
        if div.get("data-original-src")
    ]

def download(image_url, directory):
    """Download an image into `directory`, skipping files that already exist."""
    filename = image_url.split("/")[-1]
    filepath = os.path.join(directory, filename)
    if not os.path.exists(filepath):
        # Only fetch the image if we don't already have it on disk.
        content = requests.get(image_url).content
        with open(filepath, "wb") as file:
            file.write(content)

if __name__ == "__main__":
    directory = "Images"
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Collect the articles from the first 10 news listing pages
    for i in range(1, 11):
        boursorama_url = f"https://www.boursorama.com/actualite-economique/page-{i}"
        for article in urls_articles(boursorama_url):
            url = "https://www.boursorama.com" + article
            print(url)
            for image_url in image_parser(url):
                print("--->", image_url)
                download(image_url, directory)
            print()
This retrieves every image from every article on the first 10 pages.
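As written, each requests.get call opens a fresh connection with no timeout and no User-Agent header, so one slow page can hang the whole run and some servers may reject the default client. Here is a minimal hardening sketch; the session object, the FETCH_DELAY value, and the fetch helper are illustrative additions of mine, not part of the original script:

import time

import requests

# Shared session: reuses connections and sends a browser-like User-Agent.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; image-scraper)"})

FETCH_DELAY = 1.0  # illustrative pause between requests, to stay polite

def fetch(url):
    """Fetch a URL with a timeout, raising on HTTP error statuses."""
    time.sleep(FETCH_DELAY)
    response = session.get(url, timeout=10)
    response.raise_for_status()  # surface 4xx/5xx instead of saving error pages
    return response.content

Swapping each requests.get(...).content for fetch(...) in urls_articles, image_parser, and download would make failures explicit rather than silently writing error pages to disk.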