Commit 409d275d authored by Roger Michel

Auto-saving for roger.michel@stud.hslu.ch on branch master from commit fcd81614

parent fcd81614
Pipeline #448511 passed with stage in 22 seconds
""" Class containing all functions needed to download images from Google
Following example set by: https://towardsdatascience.com/image-scraping-with-python-a96feda8af2d
Adapted by: Simon van Hemert, Pascal Baumann <pascal.baumann@hslu.ch>
Date edited: 2022.10.11"""
import time
import os
import io
from PIL import Image
import requests
import pathlib


class Image_crawling:
    def __init__(self, drive, waittime=0.1):
        self.sleep_between_interactions = waittime
        self.drive = drive
        # fix page stuck on consent form
        search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
        # load the page
        self.drive.get(search_url.format(q='cat'))
        # accept cookies consent form
        self.drive.find_element('xpath', '//button').click()

    def fetch_image_urls(self, query: str, max_links_to_fetch: int):
        self.query = query
        # build the google query
        search_url = "https://www.google.com/search?safe=off&site=&tbm=isch&source=hp&q={q}&oq={q}&gs_l=img"
        # load the page
        self.drive.get(search_url.format(q=query))
        image_urls = set()
        image_count = 0
        results_start = 0
        while image_count < max_links_to_fetch:
            self.scroll_to_end()
            # get all image thumbnail results
            thumbnail_results = self.drive.find_elements('css selector', "img.Q4LuWd")
            number_results = len(thumbnail_results)
            # print(f"Found: {number_results} search results. Extracting links from {results_start}:{number_results}")
            for img in thumbnail_results[results_start:number_results]:
                # try to click every thumbnail such that we can get the real image behind it
                try:
                    img.click()
                    time.sleep(self.sleep_between_interactions)
                except Exception as e:
                    print("Exception", e, "occurred while clicking on a thumbnail")
                    continue
                # extract image urls
                actual_images = self.drive.find_elements('css selector', 'img.n3VNCb')
                for actual_image in actual_images:
                    if actual_image.get_attribute('src') and 'http' in actual_image.get_attribute('src'):
                        image_urls.add(actual_image.get_attribute('src'))
                image_count = len(image_urls)
                if len(image_urls) >= max_links_to_fetch:
                    print(f"Found {len(image_urls)} image links")
                    break
            else:
                # not enough links yet: wait, then click the "load more results" button if present
                print("Found:", len(image_urls), "image links, looking for more ...")
                time.sleep(5)
                load_more_button = self.drive.find_elements('css selector', ".mye4qd")
                if load_more_button:
                    self.drive.execute_script("document.querySelector('.mye4qd').click();")
            # move the result startpoint further down
            results_start = len(thumbnail_results)
        self.image_urls = image_urls
        return image_urls

    def scroll_to_end(self):
        self.drive.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(self.sleep_between_interactions)

    def download_image(self, folder_path: str):
        imagenr = 0
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)
        for url in self.image_urls:
            try:
                image_content = requests.get(url, timeout=5).content
            except Exception as e:
                print(f"ERROR - Could not download {url} - {e}")
                continue
            try:
                image_file = io.BytesIO(image_content)
                image = Image.open(image_file).convert('RGB')
                file_path = os.path.join(folder_path, self.query + str(imagenr) + '.jpg')
                with open(file_path, 'wb') as f:
                    image.save(f, "JPEG", quality=85)
                imagenr += 1
            except Exception as e:
                print(f"ERROR - Could not save {url} - {e}")
        print("Saved", imagenr, "images")