forked from philipperemy/amazon-reviews-scraper
core_generate_product_ids.py

import logging
import random

from banned_exception import BannedException
from constants import AMAZON_BASE_URL
from core_utils import get_soup, extract_product_id


def extract_product_ids_from_link(category_link):
    category_link_soup = get_soup(category_link)
    products_links_1 = [a.attrs['href'] for a in category_link_soup.find_all('a')
                        if 'href' in a.attrs and '/gp/product/' in a.attrs['href']]
    products_links_2 = [a.attrs['href'] for a in category_link_soup.find_all('a')
                        if 'href' in a.attrs and '/dp/' in a.attrs['href']]
    products_links = products_links_1 + products_links_2
    products_ids = list(map(extract_product_id, products_links))
    products_ids = list(filter(None.__ne__, products_ids))  # remove None values
    return products_ids
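

# Hedged sketch, not part of the original module: core_utils.extract_product_id is
# imported above but its source is not shown here. Judging from how it is used (it maps
# a product URL to a product id, or None when no id is found), it presumably pulls the
# 10-character ASIN out of '/dp/<ASIN>' or '/gp/product/<ASIN>' paths, roughly like the
# illustrative helper below (the name and the regex are assumptions, not the repo's code).
def _extract_product_id_sketch(product_link):
    import re  # local import so this illustrative helper stays self-contained
    match = re.search(r'/(?:dp|gp/product)/([A-Z0-9]{10})', product_link)
    return match.group(1) if match else None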


def get_random_product_ids(output_filename):
    # This function is randomized: every call shuffles the category links, so a new call
    # yields new values, much like resuming from a checkpoint rather than repeating the
    # previous run.
    logging.info('Writing to {}'.format(output_filename))
    with open(output_filename, 'w') as o:
        main_category_page = get_soup(AMAZON_BASE_URL + '/gp/site-directory/ref=nav_shopall_btn')
        # More categories could be reached by clicking the buttons on that page.
        category_links_soup = main_category_page.find_all('a', {'class': 'nav_a'})
        category_links = [a.attrs['href'] for a in category_links_soup]
        all_product_ids = set()
        more_category_links = list(category_links)

        # First pass: collect additional search links from every category page.
        for it, category_link in enumerate(category_links):
            try:
                logging.info('({}/{}) get as many links as we can.'.format(it, len(category_links)))
                category_link_soup = get_soup(category_link)
                new_links = [a.attrs['href'] for a in category_link_soup.find_all('a')
                             if 'href' in a.attrs and a.attrs['href'].startswith('/s/')]  # or /b/
                more_category_links.extend(new_links)
                logging.info('{} links found so far.'.format(len(more_category_links)))
            except BannedException as be:
                raise be
            except Exception as e:
                logging.error('Exception occurred. Skipping')
                logging.error(e)

        # Second pass: pop links off the shuffled stack and write any new product ids to disk.
        random.shuffle(more_category_links)
        it = 0
        while len(more_category_links) > 0:
            it += 1
            logging.info('Stack length = {}'.format(len(more_category_links)))
            category_link = more_category_links.pop()
            try:
                logging.info('({}/{}) get as many products as we can.'.format(it, len(more_category_links)))
                cur_product_ids = extract_product_ids_from_link(category_link)
                logging.info(cur_product_ids)
                for product_id in cur_product_ids:
                    if product_id not in all_product_ids:
                        all_product_ids.add(product_id)
                        o.write('{}\n'.format(product_id))
                        o.flush()
                logging.info('{} products found at this step.'.format(len(cur_product_ids)))
                logging.info('{} unique products found so far.'.format(len(all_product_ids)))
                if len(cur_product_ids) > 0:
                    # Queue the next result pages of this link, unless it is already paginated.
                    for jj in range(2, 50):
                        if 'page' in category_link:
                            break
                        more_category_links.append(category_link + '&page={}'.format(jj))
            except BannedException as be:
                raise be
            except Exception as e:
                logging.error('Exception occurred. Skipping')
                logging.error(e)
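

# Hedged usage sketch, not present in the original file: how this module might be driven
# directly. The output filename and the logging configuration below are assumptions; the
# repository presumably calls get_random_product_ids from a separate entry-point script.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s %(message)s')
    # BannedException deliberately propagates so the caller can back off or rotate IPs and retry.
    get_random_product_ids('product_ids.txt')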