GitHub Repository: packtpublishing/machine-learning-for-algorithmic-trading-second-edition
Path: blob/master/03_alternative_data/01_opentable/opentable_selenium.py
# coding: utf-8

import re
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver


def parse_html(html):
    """Parse content from various tags from OpenTable restaurants listing"""
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    # Each restaurant entry sits in a div with class 'rest-row-info'
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text

        # Number of bookings today, if displayed
        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'

        # Star rating is stored in the aria-label attribute, e.g. '4.5 stars'
        rating = resto.find('div', class_='star-rating-score')
        item['rating'] = float(rating['aria-label'].split()[0]) if rating else 'NA'

        # Review count
        reviews = resto.find('span', class_='underline-hover')
        item['reviews'] = int(re.search(r'\d+', reviews.text).group()) if reviews else 'NA'

        # Price bracket is encoded as the number of '$' signs
        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine rest-row-meta-text sfx1388addContent').text
        item['location'] = resto.find('span', class_='rest-row-meta--location rest-row-meta-text sfx1388addContent').text
        data[i] = pd.Series(item)
    return data.T

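# The helper below is an illustrative sketch, not part of the original script,
# and is never called by it: it feeds parse_html a small, made-up HTML fragment
# that mimics the class names the function expects, which is handy for checking
# the parsing logic without launching a browser. The tag structure and values
# are assumptions, not a copy of OpenTable's markup.
def _demo_parse_html():
    sample = """
    <div class="rest-row-info">
      <span class="rest-row-name-text">Sample Bistro</span>
      <div class="booking">Booked 12 times today</div>
      <div class="star-rating-score" aria-label="4.5 stars"></div>
      <span class="underline-hover">(120)</span>
      <div class="rest-row-pricing"><i>$$$</i></div>
      <span class="rest-row-meta--cuisine rest-row-meta-text sfx1388addContent">Italian</span>
      <span class="rest-row-meta--location rest-row-meta-text sfx1388addContent">Midtown</span>
    </div>
    """
    # Returns a one-row DataFrame with name, bookings, rating, reviews, price,
    # cuisine and location columns.
    return parse_html(sample)
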
# Start selenium and click through pages until we reach the end;
# store results by iteratively appending to a csv file.
# (See the sketch at the end of the file for a Selenium 4-style
# variant of the 'Next' click.)
driver = webdriver.Firefox()
url = "https://www.opentable.com/new-york-restaurant-listings"
driver.get(url)
page = collected = 0
while True:
    sleep(1)  # give the page a moment to render before parsing
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    if page == 0:
        # first page: write the csv with a header row
        new_data.to_csv('results.csv', index=False)
    elif page > 0:
        # subsequent pages: append without repeating the header
        new_data.to_csv('results.csv', index=False, header=False, mode='a')
    page += 1
    collected += len(new_data)
    print(f'Page: {page} | Downloaded: {collected}')
    # find_element_by_link_text is the pre-Selenium-4 API; it raises
    # NoSuchElementException once there is no 'Next' link
    driver.find_element_by_link_text('Next').click()

driver.close()
restaurants = pd.read_csv('results.csv')
print(restaurants)
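
# Minimal sketch, assuming Selenium 4+ (where the find_element_by_* helpers were
# removed): the pagination click above could be expressed with the By locator API
# and a guard against the missing 'Next' link on the last page. The click_next
# helper is hypothetical and is not called by the script above.
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def click_next(driver):
    """Click the 'Next' link if present; return False when there are no more pages."""
    try:
        driver.find_element(By.LINK_TEXT, 'Next').click()
        return True
    except NoSuchElementException:
        return False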