
Source Code - Google Image Crawling (Selenium)

by 아재코더 2021. 12. 14.

Before writing the web crawler, you need to do some setup: install the Chrome WebDriver and Selenium.

Go to https://chromedriver.chromium.org/downloads and download the chromedriver that matches your Chrome version. (It will still run even if the versions do not match exactly.)
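If the driver and Selenium are installed correctly, a quick smoke test should open Chrome and print the page title. This is a minimal sketch assuming a Selenium 3.x install and the Windows driver path used later in this post; adjust the path for your machine:

# pip install selenium beautifulsoup4 lxml requests
from selenium import webdriver

# Adjust this path to wherever you saved chromedriver.exe.
chromedriver = 'C:/Program Files/Google/Chrome/Application/chromedriver.exe'

browser = webdriver.Chrome(chromedriver)  # Selenium 3-style positional driver path
browser.get('https://www.google.com')
print(browser.title)                      # prints "Google" if everything is wired up
browser.quit()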

I originally wanted to scrape with a Google image API, but that route is blocked now. Without the API, though, crawling is still possible with a somewhat low-tech approach...
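To see why a real browser is needed at all, here is a minimal sketch (assuming only requests and BeautifulSoup) of what a plain HTTP fetch gets you: no JavaScript runs and nothing scrolls, so only the handful of thumbnails embedded in the initial HTML are visible. That is why the code below drives an actual browser instead.

import requests
from bs4 import BeautifulSoup

# Static fetch of the image-search page: no JavaScript, no scrolling.
url = 'https://www.google.com/search?q=%EC%BD%94%EB%A1%9C%EB%82%98&tbm=isch'  # '코로나', URL-encoded
res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
soup = BeautifulSoup(res.text, 'lxml')

# Far fewer <img> tags than a scrolled browser session would see.
print(len(soup.find_all('img')))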

I am still picking the code below apart myself; my skills are not quite there yet...

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup

import os
import sys

import requests
import urllib.parse
import urllib3
from urllib3.exceptions import InsecureRequestWarning

import time

# The image downloads below use verify=False, so silence the resulting warnings.
urllib3.disable_warnings(InsecureRequestWarning)

searchword1 = '코로나'
searchword2 = ''
searchword3 = ''
# URL-encode the keywords so non-ASCII search terms survive the query string.
query = urllib.parse.quote_plus(' '.join(w for w in (searchword1, searchword2, searchword3) if w))
searchurl = 'https://www.google.com/search?q=' + query + '&source=lnms&tbm=isch'
dirs = 'pictures'
maxcount = 50

chromedriver = 'C:/Program Files/Google/Chrome/Application/chromedriver.exe'

if not os.path.exists(dirs):
    os.mkdir(dirs)

def download_google_staticimages():
    options = webdriver.ChromeOptions()
    options.add_argument('--no-sandbox')
    #options.add_argument('--headless')

    try:
        # Selenium 3-style call: the first argument is the chromedriver path.
        browser = webdriver.Chrome(chromedriver, options=options)
    except Exception as e:
        print('chromedriver was not found in this environment.')
        print(f'Install it on your machine. exception: {e}')
        sys.exit()

    browser.set_window_size(1280, 1024)
    browser.get(searchurl)
    time.sleep(1)

    print('Getting you a lot of images. This may take a few moments...')

    element = browser.find_element_by_tag_name('body')

    # Scroll down so Google lazy-loads more thumbnails.
    for i in range(maxcount):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)

    try:
        browser.find_element_by_id('smb').click()
        for i in range(50):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)

    print('Reached end of page.')
    time.sleep(0.5)
    print('Retry')
    time.sleep(0.5)

    # The "Show more results" button is not always rendered, so guard the click.
    try:
        browser.find_element_by_xpath('//input[@value="Show more results"]').click()
    except Exception:
        pass

    # Scroll down again to load the second batch of results.
    for i in range(50):
        element.send_keys(Keys.PAGE_DOWN)
        time.sleep(0.3)

    try:
        browser.find_element_by_id('smb').click()
        for i in range(50):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)
    except Exception:
        for i in range(10):
            element.send_keys(Keys.PAGE_DOWN)
            time.sleep(0.3)

    page_source = browser.page_source

    soup = BeautifulSoup(page_source, 'lxml')
    images = soup.find_all('img')

    # Thumbnails carry their URL in either data-src or src; keep https ones only.
    urls = []
    for image in images:
        url = image.get('data-src') or image.get('src')
        if url and url.startswith('https://'):
            urls.append(url)

    count = 0
    for url in urls:
        try:
            res = requests.get(url, verify=False, stream=True)
            rawdata = res.raw.read()
            with open(os.path.join(dirs, 'img_' + str(count) + '.jpg'), 'wb') as f:
                f.write(rawdata)
            count += 1
        except Exception as e:
            print('Failed to write rawdata.')
            print(e)

    browser.close()
    return count

# Main block
def main():
    t0 = time.time()
    count = download_google_staticimages()
    t1 = time.time()

    total_time = t1 - t0
    print()
    print(f'Download completed. [Successful count = {count}].')
    print(f'Total time is {total_time:.1f} seconds.')

if __name__ == '__main__':
    main()
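One caveat: the find_element_by_* helpers used above are gone in current Selenium 4 releases, so on a recent install the lookups fail with an AttributeError. A sketch of the equivalent Selenium 4 calls, assuming the same driver path and page structure:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service

chromedriver = 'C:/Program Files/Google/Chrome/Application/chromedriver.exe'

options = webdriver.ChromeOptions()
options.add_argument('--no-sandbox')

# Selenium 4 passes the driver path through a Service object.
browser = webdriver.Chrome(service=Service(chromedriver), options=options)
browser.get('https://www.google.com')

# The old find_element_by_* helpers become find_element(By.<HOW>, ...):
element = browser.find_element(By.TAG_NAME, 'body')
# e.g. browser.find_element(By.ID, 'smb').click()
# e.g. browser.find_element(By.XPATH, '//input[@value="Show more results"]')

browser.quit()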

 

Below is a site that has been helping me out in all sorts of ways. Give it a try!

 

W3Schools Free Online Web Tutorials
www.w3schools.com
