Path: blob/master/03_alternative_data/01_opentable/opentable_selenium.py
# coding: utf-8

import re
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def parse_html(html):
    """Parse restaurant name, bookings, rating, reviews, price, cuisine and
    location from an OpenTable restaurant listings page."""
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text

        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'

        rating = resto.find('div', class_='star-rating-score')
        item['rating'] = float(rating['aria-label'].split()[0]) if rating else 'NA'

        reviews = resto.find('span', class_='underline-hover')
        item['reviews'] = int(re.search(r'\d+', reviews.text).group()) if reviews else 'NA'

        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine rest-row-meta-text sfx1388addContent').text
        item['location'] = resto.find('span', class_='rest-row-meta--location rest-row-meta-text sfx1388addContent').text
        data[i] = pd.Series(item)
    return data.T


# Start Selenium and click through the result pages until the last one is reached;
# store the results by iteratively appending to a csv file.
driver = webdriver.Firefox()
url = 'https://www.opentable.com/new-york-restaurant-listings'
driver.get(url)
page = collected = 0
while True:
    sleep(1)  # give the page time to render before parsing
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    if page == 0:
        new_data.to_csv('results.csv', index=False)
    else:
        new_data.to_csv('results.csv', index=False, header=False, mode='a')
    page += 1
    collected += len(new_data)
    print(f'Page: {page} | Downloaded: {collected}')
    try:
        # Selenium 4 locator syntax; the last page has no 'Next' link
        driver.find_element(By.LINK_TEXT, 'Next').click()
    except NoSuchElementException:
        break

driver.close()
restaurants = pd.read_csv('results.csv')
print(restaurants)
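
# Optional sketch (not part of the original script): the scraper can be run
# without a visible browser window, and the fixed sleep(1) can be replaced by
# an explicit wait on the listing container. Both use standard Selenium APIs;
# the class name 'rest-row-info' is the same one parse_html relies on above.
#
# from selenium.webdriver.firefox.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
#
# options = Options()
# options.add_argument('--headless')
# driver = webdriver.Firefox(options=options)
# driver.get(url)
# WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located((By.CLASS_NAME, 'rest-row-info')))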