forked from philipperemy/amazon-reviews-scraper

core_utils.py
import errno
import json
import logging
import os
import re
from time import sleep

import requests
from bs4 import BeautifulSoup

from banned_exception import BannedException
from constants import AMAZON_BASE_URL

OUTPUT_DIR = 'comments'

if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)


def get_reviews_filename(product_id):
    # Reviews for a product are cached as comments/<product_id>.json.
    filename = os.path.join(OUTPUT_DIR, '{}.json'.format(product_id))
    exist = os.path.isfile(filename)
    return filename, exist


def mkdir_p(path):
    # Race-safe equivalent of `mkdir -p`: ignore the error if the directory
    # already exists, re-raise anything else.
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno == errno.EEXIST and os.path.isdir(path):
            pass
        else:
            raise
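
# Note: on Python 3.2+, `os.makedirs(path, exist_ok=True)` has the same
# effect as mkdir_p() above; the errno check is kept to match the original
# code's behavior.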


def persist_comment_to_disk(reviews):
    # Write the reviews of a single product to comments/<product_id>.json;
    # skip the write if that product was already scraped.
    if len(reviews) == 0:
        return False
    product_id_set = {r['product_id'] for r in reviews}
    assert len(product_id_set) == 1, 'all product ids should be the same in the reviews list.'
    product_id = next(iter(product_id_set))
    output_filename, exist = get_reviews_filename(product_id)
    if exist:
        return False
    mkdir_p(OUTPUT_DIR)
    # https://stackoverflow.com/questions/18337407/saving-utf-8-texts-in-json-dumps-as-utf8-not-as-u-escape-sequence/18337754
    with open(output_filename, 'w', encoding='utf-8') as fp:
        json.dump(reviews, fp, sort_keys=True, indent=4, ensure_ascii=False)
    return True
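
# Example call (hypothetical data; only the 'product_id' key is required by
# persist_comment_to_disk, the other fields are illustrative):
#
#     persist_comment_to_disk([
#         {'product_id': 'B01H8A7Q42', 'rating': 5, 'body': 'Great product.'},
#     ])
#
# This writes comments/B01H8A7Q42.json and returns True, or returns False if
# the file already exists or the list is empty.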


def extract_product_id(link_from_main_page):
    # Product ids look like B01H8A7Q42 and follow a /dp/ or /gp/product/
    # segment in the link.
    p_id = ''  # was -1 in the original, which made re.match() raise TypeError when no tag was found.
    tags = ['/dp/', '/gp/product/']
    for tag in tags:
        try:
            p_id = link_from_main_page[link_from_main_page.index(tag) + len(tag):].split('/')[0]
        except ValueError:  # tag not present in the link.
            continue
    m = re.match('[A-Z0-9]{10}', p_id)
    if m:
        return m.group()
    return None
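
# Example (URLs are illustrative):
#
#     extract_product_id('https://www.amazon.com/dp/B01H8A7Q42/ref=sr_1_1')
#     -> 'B01H8A7Q42'
#     extract_product_id('https://www.amazon.com/gp/help/customer')
#     -> None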


def get_soup(url):
    if AMAZON_BASE_URL not in url:
        url = AMAZON_BASE_URL + url
    nap_time_sec = 1
    logging.debug('Script is going to sleep for {} (Amazon throttling). ZZZzzzZZZzz.'.format(nap_time_sec))
    sleep(nap_time_sec)
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/43.0.2357.134 Safari/537.36'
    }
    logging.debug('-> to Amazon : {}'.format(url))
    out = requests.get(url, headers=header)
    assert out.status_code == 200, 'unexpected status code: {}'.format(out.status_code)
    soup = BeautifulSoup(out.content, 'lxml')
    if 'captcha' in str(soup):
        raise BannedException('Your bot has been detected. Please wait a while.')
    return soup
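

if __name__ == '__main__':
    # Minimal smoke-test sketch, not part of the original module. The product
    # link below is a hypothetical placeholder, and no network request is
    # made; this only exercises the pure helpers defined above.
    logging.basicConfig(level=logging.DEBUG)
    sample_link = '/dp/B01H8A7Q42/'  # hypothetical product link.
    product_id = extract_product_id(sample_link)
    print('extracted product id:', product_id)
    if product_id is not None:
        filename, exist = get_reviews_filename(product_id)
        print('reviews would be cached in: {} (exists: {})'.format(filename, exist))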