Commit 409d275d authored by Roger Michel
Browse files

Auto-saving for on branch master from commit fcd81614

parent fcd81614
Pipeline #448511 passed with stage
in 22 seconds
This diff is collapsed.
""" Class containing all functions needed to download images from Google
Following an example (reference link missing in this copy).
Adapted by: Simon van Hemert, Pascal Baumann
Date edited: 2022.10.11"""
import time
import os
import io
from PIL import Image
import requests
import pathlib
class Image_crawling:
    """Collect image URLs from a Google image search and save them as JPEGs.

    The class drives an already-created browser driver object: it loads the
    image search page, clicks thumbnails to reveal the full-size images,
    gathers their ``src`` URLs and finally downloads and re-encodes them.

    NOTE(review): this class was reconstructed from a copy in which several
    tokens were lost (the receivers of the driver calls, ``try:``/``break``
    lines, the sleeps and the host part of the search URL). Reconstructed
    pieces are marked with NOTE(review) below - confirm against the original.

    Attributes:
        sleep_between_interactions: Seconds to wait after each browser action.
        wd: The browser driver passed to the constructor.
        query: The most recent search term (used as file-name prefix).
        image_urls: Set of image URLs found by the last ``fetch_image_urls``.
    """

    # NOTE(review): only the tail "{q}&oq={q}&gs_l=img" survived in the copy;
    # the host and fixed parameters are presumably the Google image search
    # endpoint - verify.
    SEARCH_URL = ("https://www.google.com/search?safe=off&site=&tbm=isch"
                  "&source=hp&q={q}&oq={q}&gs_l=img")

    # Consecutive passes without any new thumbnail before the search gives up.
    MAX_STALLED_PASSES = 3

    def __init__(self, drive, waittime=0.1):
        """Store the driver and dismiss the cookie consent form once.

        Args:
            drive: Browser driver object; it must offer ``get``,
                ``find_element``, ``find_elements`` and ``execute_script``.
            waittime: Seconds to pause between browser interactions.
        """
        self.sleep_between_interactions = waittime
        # NOTE(review): attribute name reconstructed; the copy only showed
        # "... = drive".
        self.wd = drive
        # Defined up front so download_image() is safe before any fetch.
        self.query = ""
        self.image_urls = set()

        # fix page stuck on consent form: load a throw-away search first ...
        self.wd.get(self.SEARCH_URL.format(q='cat'))
        # ... and accept the cookies consent form (first button on the page).
        self.wd.find_element('xpath', '//button').click()

    def fetch_image_urls(self, query: str, max_links_to_fetch: int):
        """Search for ``query`` and collect full-size image URLs.

        Args:
            query: Search term; also remembered in ``self.query``.
            max_links_to_fetch: Stop once at least this many URLs were found.

        Returns:
            The set of collected URLs (also stored in ``self.image_urls``).
            It may hold fewer entries than requested when the result page
            stops yielding new thumbnails, and slightly more when one click
            reveals several images.
        """
        self.query = query
        # load the page
        self.wd.get(self.SEARCH_URL.format(q=query))

        image_urls = set()
        image_count = 0
        results_start = 0
        stalled_passes = 0
        while image_count < max_links_to_fetch:
            self.scroll_to_end()

            # get all image thumbnail results
            thumbnail_results = self.wd.find_elements('css selector', "img.Q4LuWd")
            number_results = len(thumbnail_results)

            # Without this guard the loop never ends when the search has
            # fewer results than requested.
            if number_results <= results_start:
                stalled_passes += 1
                if stalled_passes >= self.MAX_STALLED_PASSES:
                    print(f"Found {len(image_urls)} image links, no more results available")
                    break
            else:
                stalled_passes = 0

            for img in thumbnail_results[results_start:number_results]:
                # try to click every thumbnail such that we can get the real
                # image behind it
                try:
                    img.click()
                    # NOTE(review): pause reconstructed; gives the large
                    # image time to load.
                    time.sleep(self.sleep_between_interactions)
                except Exception as e:
                    print("Exception", e, "occured in clicking on thumbnails ")
                    continue

                # extract image urls
                actual_images = self.wd.find_elements('css selector', 'img.n3VNCb')
                for actual_image in actual_images:
                    src = actual_image.get_attribute('src')
                    if src and 'http' in src:
                        image_urls.add(src)

                image_count = len(image_urls)
                if image_count >= max_links_to_fetch:
                    print(f"Found {len(image_urls)} image links")
                    break
            else:
                # Every visible thumbnail was handled without reaching the
                # target: ask the page for more results.
                print("Found:", len(image_urls), "image links, looking for more ...")
                # find_elements returns an empty list when the button is
                # absent, so the check below cannot raise.
                load_more_button = self.wd.find_elements('css selector', ".mye4qd")
                if load_more_button:
                    self.wd.execute_script("document.querySelector('.mye4qd').click();")

            # move the result startpoint further down
            results_start = len(thumbnail_results)

        self.image_urls = image_urls
        return image_urls

    def scroll_to_end(self):
        """Scroll the page to the bottom so further thumbnails get loaded."""
        self.wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # NOTE(review): pause reconstructed from the attribute's name.
        time.sleep(self.sleep_between_interactions)

    def download_image(self, folder_path: str):
        """Download every URL in ``self.image_urls`` and save it as JPEG.

        Files are written to ``folder_path`` (created if missing) and named
        ``<query><n>.jpg`` where ``n`` counts the successfully saved images.
        Failures are reported on stdout and do not abort the run.

        Args:
            folder_path: Target directory for the image files.
        """
        imagenr = 0
        # Loop-invariant, so create the target folder once.
        pathlib.Path(folder_path).mkdir(parents=True, exist_ok=True)

        for url in self.image_urls:
            try:
                image_content = requests.get(url, timeout=5).content
            except Exception as e:
                print(f"ERROR - Could not download {url} - {e}")
                # Nothing to save; falling through would reuse stale or
                # undefined content.
                continue

            try:
                image_file = io.BytesIO(image_content)
                image = Image.open(image_file).convert('RGB')
                file_path = os.path.join(folder_path, self.query + str(imagenr) + '.jpg')
                with open(file_path, 'wb') as f:
                    image.save(f, "JPEG", quality=85)
                imagenr += 1
            except Exception as e:
                print(f"ERROR - Could not save {url} - {e}")

        print("Saved", imagenr, "images")
%% Cell type:code id:8a662d04-a60c-40b1-bc89-7bc606c1a9dc tags:
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment