forked from philipperemy/amazon-reviews-scraper

core_utils.py
import errno
import json
import logging
import os
import re
from time import sleep

import requests
from bs4 import BeautifulSoup

from banned_exception import BannedException
from constants import AMAZON_BASE_URL

OUTPUT_DIR = 'comments'

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


def get_reviews_filename(product_id):
    # Reviews for a product are cached as comments/<product_id>.json.
    filename = os.path.join(OUTPUT_DIR, '{}.json'.format(product_id))
    exist = os.path.isfile(filename)
    return filename, exist


def mkdir_p(path):
    # Race-safe equivalent of `mkdir -p`: ignore the error if the directory
    # already exists, re-raise anything else.
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
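
# Note: on Python 3.2+, `os.makedirs(path, exist_ok=True)` has the same
# effect as mkdir_p() above; the errno check is kept to match the original
# code's behavior.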


def persist_comment_to_disk(reviews):
    # Write the reviews of a single product to comments/<product_id>.json;
    # skip the write if that product was already scraped.
    if len(reviews) == 0:
        return False
    product_id_set = {r['product_id'] for r in reviews}
    assert len(product_id_set) == 1, 'all product ids should be the same in the reviews list.'
    product_id = next(iter(product_id_set))
    output_filename, exist = get_reviews_filename(product_id)
    if exist:
        return False
    mkdir_p(OUTPUT_DIR)
    # https://stackoverflow.com/questions/18337407/saving-utf-8-texts-in-json-dumps-as-utf8-not-as-u-escape-sequence/18337754
    with open(output_filename, 'w', encoding='utf-8') as fp:
        json.dump(reviews, fp, sort_keys=True, indent=4, ensure_ascii=False)
    return True
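
# Example call (hypothetical data; only the 'product_id' key is required by
# persist_comment_to_disk, the other fields are illustrative):
#
#     persist_comment_to_disk([
#         {'product_id': 'B01H8A7Q42', 'rating': 5, 'body': 'Great product.'},
#     ])
#
# This writes comments/B01H8A7Q42.json and returns True, or returns False if
# the file already exists or the list is empty.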


def extract_product_id(link_from_main_page):
    # Product ids look like B01H8A7Q42 and follow a /dp/ or /gp/product/
    # segment in the link.
    p_id = ''  # was -1 in the original, which made re.match() raise TypeError when no tag was found.
    tags = ['/dp/', '/gp/product/']
    for tag in tags:
        try:
            p_id = link_from_main_page[link_from_main_page.index(tag) + len(tag):].split('/')[0]
        except ValueError:  # tag not present in the link.
            continue
    m = re.match('[A-Z0-9]{10}', p_id)
    if m:
        return m.group()
    return None
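
# Example (URLs are illustrative):
#
#     extract_product_id('https://www.amazon.com/dp/B01H8A7Q42/ref=sr_1_1')
#     -> 'B01H8A7Q42'
#     extract_product_id('https://www.amazon.com/gp/help/customer')
#     -> None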


def get_soup(url):
    if AMAZON_BASE_URL not in url:
        url = AMAZON_BASE_URL + url
    nap_time_sec = 1
    logging.debug('Script is going to sleep for {} (Amazon throttling). ZZZzzzZZZzz.'.format(nap_time_sec))
    sleep(nap_time_sec)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'
    }
    logging.debug('-> to Amazon : {}'.format(url))
    out = requests.get(url, headers=header)
    assert out.status_code == 200, 'unexpected status code: {}'.format(out.status_code)
    soup = BeautifulSoup(out.content, 'lxml')
    if 'captcha' in str(soup):
        raise BannedException('Your bot has been detected. Please wait a while.')
    return soup
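

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the original module. The product
    # link below is a hypothetical placeholder, and no network request is
    # made; this only exercises the pure helpers defined above.
    logging.basicConfig(level=logging.DEBUG)
    sample_link = '/dp/B01H8A7Q42/'  # hypothetical product link.
    product_id = extract_product_id(sample_link)
    print('extracted product id:', product_id)
    if product_id is not None:
        filename, exist = get_reviews_filename(product_id)
        print('reviews would be cached in: {} (exists: {})'.format(filename, exist))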