imageScraper.py

import os
import sys

import requests
from bs4 import BeautifulSoup
from icrawler.builtin import BaiduImageCrawler, BingImageCrawler
def download_images(image_urls, folder_path, max_images):
    """Download up to max_images images from image_urls into folder_path."""
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    count = 1
    for url in image_urls:
        try:
            image_content = requests.get(url).content
            image_file_path = os.path.join(folder_path, f"image_{count}.jpg")
            with open(image_file_path, 'wb') as image_file:
                image_file.write(image_content)
            count += 1
            if count > max_images:
                break
        except Exception as e:
            print(f"Could not download {url}: {e}")


# Read the command-line arguments: <images per source> <search query>
if len(sys.argv) < 3:
    print("Usage: python imageScraper.py <nb_of_images_per_source> <request>")
    sys.exit(1)
count = int(sys.argv[1])
keyword = ' '.join(sys.argv[2:])

# Create a directory to store the images
if not os.path.exists(keyword):
    os.makedirs(keyword)

# Bing image crawler
bing_crawler = BingImageCrawler(downloader_threads=4, storage={'root_dir': f"{keyword}/bing"})
bing_crawler.crawl(keyword=keyword, max_num=count)

# Baidu image crawler, with error handling
try:
    baidu_crawler = BaiduImageCrawler(downloader_threads=4, storage={'root_dir': f"{keyword}/baidu"})
    baidu_crawler.crawl(keyword=keyword, max_num=count)
except KeyError as e:
    print(f"Error with Baidu crawler: {e}")

# Google scraper: fetch the image-search results page and collect the
# URLs of any <img> tags whose src points at an http(s) resource
url = f"https://www.google.com/search?q={keyword}&tbm=isch"
response = requests.get(url)
soup = BeautifulSoup(response.text, "html.parser")
image_urls = []
for img in soup.find_all("img"):
    src = img.get('src')
    if src and src.startswith('http'):
        image_urls.append(src)
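
# Note (assumption, not part of the original script): Google often serves a
# reduced page to clients without a browser-like User-Agent, so this request
# may yield only thumbnail URLs or none at all; passing something like
# headers={'User-Agent': 'Mozilla/5.0'} to requests.get is a common mitigation.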

# Download the images collected from the Google results page, capped at the
# same per-source count passed on the command line
download_images(image_urls, f"{keyword}/google", max_images=count)
print("Image scraping completed.")