Sunday, June 1, 2025

selenium webdriver – Web Scrape and Download Images with Python XPath Problem


I'm trying to download all the images that appear in a Google Images search.

I'm using Selenium with Python in PyCharm.

When I inspect with the Chrome developer tools to get the XPath, I get the following:

//*[@id="islrg"]/div[1]/div[1]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[24]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[26]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[49]/a[1]/div[1]/img

//*[@id="islrg"]/div[1]/div[51]/div[1]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[51]/div[24]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[51]/div[26]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[51]/div[49]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[51]/div[51]/a[1]/div[1]/img

//*[@id="islrg"]/div[1]/div[52]/div[20]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[52]/div[45]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[52]/div[47]/a[1]/div[1]/img

//*[@id="islrg"]/div[1]/div[52]/div[70]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[52]/div[72]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[52]/div[95]/a[1]/div[1]/img
//*[@id="islrg"]/div[1]/div[53]/div[18]/a[1]/div[1]/img

1- How can I get the XPath dynamically?!
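One way to avoid hard-coded, index-by-index paths is to use a single relative XPath that matches every thumbnail img at once. Below is a minimal sketch, under the assumption that the id="islrg" grid from the paths above is still present on the page and that "books" is just an example query:

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get("https://www.google.com/search?q=books&tbm=isch")  # example query, not from the original code

# Sketch only: '//*[@id="islrg"]//a/div/img' is derived from the paths above,
# not a verified selector for the current Google Images markup.
thumbnails = driver.find_elements(By.XPATH, '//*[@id="islrg"]//a/div/img')
for i, thumb in enumerate(thumbnails, start=1):
    print(i, thumb.get_attribute('src'))

driver.quit()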

Then, when trying to get the actual image URL, I only get encrypted ones, for example:
(https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTWbVxeA8AzNYZoHKN91jM2UMG_g1pjlWN5kQ&usqp=CAU)
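For reference, the encrypted-tbn0.gstatic.com links are Google's thumbnail CDN, not the original image locations. A small check like the one below (the helper name is my own, purely for illustration) can separate those placeholders from real image URLs:

# Illustration only; is_thumbnail_url is a hypothetical helper name.
def is_thumbnail_url(src):
    # Grid thumbnails are served from encrypted-tbn*.gstatic.com or inlined
    # as base64 "data:" URIs; neither points to the original image file.
    return (not src) or src.startswith('data:') or 'encrypted-tbn' in src

print(is_thumbnail_url('https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTWbVxeA8AzNYZoHKN91jM2UMG_g1pjlWN5kQ&usqp=CAU'))  # True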

I post my whole code here:

import os
import time
import requests  # Import the requests module
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from PIL import Image
import io
import pandas as pd
import sys

import msvcrt, time

books = []

def get_images_from_google(driver, delay, max_images):
    # Scroll to the bottom of the results page so more thumbnails load
    def scroll_down(driver):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(delay)

    image_urls = set()
    skips = 0

    while len(image_urls) + skips < max_images:
        scroll_down(driver)

        thumbnails = driver.find_elements(By.CSS_SELECTOR, '.rg_i, .Q4LuWd')
        for img in thumbnails[len(image_urls) + skips:max_images]:
            try:
                img.click()
                time.sleep(delay)
            except:
                continue

            images = driver.find_elements(By.CSS_SELECTOR, '.sFlh5c, .pT0Scc, .iPVvYb')
            for image in images:
                if image.get_attribute('src') in image_urls:
                    max_images += 1
                    skips += 1
                    break

                if image.get_attribute('src') and 'http' in image.get_attribute('src'):
                    image_urls.add(image.get_attribute('src'))
                    print(f"Found {len(image_urls)}")

    return image_urls

def download_image(download_path, url, file_name):
    try:
        image_content = requests.get(url).content
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file)
        print(url)

        # Check whether the image can be identified and is in a supported format
        if image.format not in ["JPEG", "PNG"]:
            print(f"Skipping image with unsupported format: {url}")
            return

        file_path = os.path.join(download_path, file_name)  # Use os.path.join to build a correct path

        with open(file_path, "wb") as f:
            image.save(f, "JPEG")

        print("Success")
    except Exception as e:
        print('FAILED -', e)


# Ask the user for the search query
search_query = input("Enter your Google Images search query: ")

# Create the 'imgs/' directory if it doesn't exist
download_path = "imgs"
os.makedirs(download_path, exist_ok=True)

# Create a Chrome driver
options = Options()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

# Open the Google Images search page with the provided search query
search_url = f"https://www.google.com/search?q={search_query}&tbm=isch"
driver.get(search_url)

# Perform the image scraping and downloading
urls = get_images_from_google(driver, 0.01, 10000)


for i, url in enumerate(urls):
    download_image(download_path, url, str(i) + ".JPEG")
    books.append([url])


df = pd.DataFrame(books, columns=['URL'])
df.to_csv('books.csv')
# Close the driver instance
driver.quit()

2- How can I improve it to get the actual URL of each image?
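One possible improvement is to replace the fixed time.sleep(delay) after each click with an explicit wait that only returns once the preview img carries a real http(s) URL rather than a base64 or encrypted-tbn placeholder. The sketch below assumes the preview-pane selectors .sFlh5c, .pT0Scc, .iPVvYb from the code above are still valid, and the 5-second timeout is an arbitrary choice:

# Sketch only: selectors and timeout are assumptions, not verified against the current markup.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

def get_full_image_url(driver, thumbnail, timeout=5):
    thumbnail.click()

    def full_src_loaded(drv):
        # Return the first preview src that is neither a data: URI nor a thumbnail CDN link.
        for img in drv.find_elements(By.CSS_SELECTOR, '.sFlh5c, .pT0Scc, .iPVvYb'):
            src = img.get_attribute('src')
            if src and src.startswith('http') and 'encrypted-tbn' not in src:
                return src
        return False

    try:
        return WebDriverWait(driver, timeout).until(full_src_loaded)
    except Exception:
        return None  # No full-size URL appeared in time; the caller can skip this thumbnail

The URL returned this way could then be passed straight to download_image, instead of whatever src happens to be loaded when the fixed delay expires.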
