Path: blob/master/03_alternative_data/01_opentable/opentable_selenium.py
# coding: utf-8

import re
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By


def parse_html(html):
    """Parse restaurant name, bookings, rating, reviews, price, cuisine and
    location from an OpenTable restaurant listings page."""
    data, item = pd.DataFrame(), {}
    soup = BeautifulSoup(html, 'lxml')
    for i, resto in enumerate(soup.find_all('div', class_='rest-row-info')):
        item['name'] = resto.find('span', class_='rest-row-name-text').text

        booking = resto.find('div', class_='booking')
        item['bookings'] = re.search(r'\d+', booking.text).group() if booking else 'NA'

        rating = resto.find('div', class_='star-rating-score')
        item['rating'] = float(rating['aria-label'].split()[0]) if rating else 'NA'

        reviews = resto.find('span', class_='underline-hover')
        item['reviews'] = int(re.search(r'\d+', reviews.text).group()) if reviews else 'NA'

        item['price'] = int(resto.find('div', class_='rest-row-pricing').find('i').text.count('$'))
        item['cuisine'] = resto.find('span', class_='rest-row-meta--cuisine rest-row-meta-text sfx1388addContent').text
        item['location'] = resto.find('span', class_='rest-row-meta--location rest-row-meta-text sfx1388addContent').text
        data[i] = pd.Series(item)
    return data.T


# Start Selenium and click through the result pages until the last one is reached;
# store the results by iteratively appending to a csv file.
driver = webdriver.Firefox()
url = 'https://www.opentable.com/new-york-restaurant-listings'
driver.get(url)
page = collected = 0
while True:
    sleep(1)  # give the page time to render before parsing
    new_data = parse_html(driver.page_source)
    if new_data.empty:
        break
    if page == 0:
        new_data.to_csv('results.csv', index=False)
    else:
        new_data.to_csv('results.csv', index=False, header=False, mode='a')
    page += 1
    collected += len(new_data)
    print(f'Page: {page} | Downloaded: {collected}')
    try:
        # Selenium 4 locator syntax; the last page has no 'Next' link
        driver.find_element(By.LINK_TEXT, 'Next').click()
    except NoSuchElementException:
        break

driver.close()
restaurants = pd.read_csv('results.csv')
print(restaurants)
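
# Optional sketch (not part of the original script): the scraper can be run
# without a visible browser window, and the fixed sleep(1) can be replaced by
# an explicit wait on the listing container. Both use standard Selenium APIs;
# the class name 'rest-row-info' is the same one parse_html relies on above.
#
# from selenium.webdriver.firefox.options import Options
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC
#
# options = Options()
# options.add_argument('--headless')
# driver = webdriver.Firefox(options=options)
# driver.get(url)
# WebDriverWait(driver, 10).until(
#     EC.presence_of_element_located((By.CLASS_NAME, 'rest-row-info')))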