Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
jantic
GitHub Repository: jantic/deoldify
Path: blob/master/fastai/widgets/image_downloader.py
781 views
1
from ..core import *
2
from ..vision.data import *
3
from ipywidgets import widgets, Layout, Output, HBox, VBox, Text, BoundedIntText, Button, Dropdown, Box
4
from IPython.display import clear_output, display
5
from urllib.parse import quote
6
from bs4 import BeautifulSoup
7
import time
8
9
# Public API of this module: the notebook widget class and the standalone download function.
__all__ = ['ImageDownloader', 'download_google_images']
10
11
# Maps the minimum-size labels offered to the user onto the size filter that is
# appended to the `tbs` query parameter of a Google Images search URL
# (see `_url_params`). Every value shares the `isz:lt,islt:` prefix.
_img_sizes = {
    '>400*300': 'isz:lt,islt:qsvga',
    '>640*480': 'isz:lt,islt:vga',
    '>800*600': 'isz:lt,islt:svga',
    '>1024*768': 'isz:lt,islt:xga',
    '>2MP': 'isz:lt,islt:2mp',
    '>4MP': 'isz:lt,islt:4mp',
    '>6MP': 'isz:lt,islt:6mp',
    '>8MP': 'isz:lt,islt:8mp',
    '>10MP': 'isz:lt,islt:10mp',
    '>12MP': 'isz:lt,islt:12mp',
    '>15MP': 'isz:lt,islt:15mp',
    '>20MP': 'isz:lt,islt:20mp',
    '>40MP': 'isz:lt,islt:40mp',
    '>70MP': 'isz:lt,islt:70mp',
}
15
16
class ImageDownloader():
    """
    Displays a widget that allows searching and downloading images from google images search
    in a Jupyter Notebook or Lab.

    Downloads are delegated to `download_google_images`, which stores them under
    `path`/<search term>; up to 12 of the downloaded images are previewed in the widget.
    """
    def __init__(self, path:Union[Path,str]='data'):
        "Setup path to save images to, init the UI, and render the widgets."
        self._path = Path(path)
        self._ui = self._init_ui()
        self.render()

    def _init_ui(self) -> VBox:
        "Initialize the widget UI and return the UI."
        # Input controls: search term, number of images, minimum image size.
        self._search_input = Text(placeholder="What images to search for?")
        self._count_input = BoundedIntText(placeholder="How many pics?", value=10, min=1, max=5000, step=1,
                                           layout=Layout(width='60px'))
        self._size_input = Dropdown(options= _img_sizes.keys(), value='>400*300', layout=Layout(width='120px'))
        self._download_button = Button(description="Search & Download", icon="download", layout=Layout(width='200px'))
        self._download_button.on_click(self.on_download_button_click)
        self._output = Output()
        self._controls_pane = HBox([self._search_input, self._count_input, self._size_input, self._download_button],
                                   layout=Layout(width='auto', height='40px'))
        # The preview header starts empty and is swapped for the "complete" heading after a download.
        self._heading = ""
        self._download_complete_heading = "<h3>Download complete. Here are a few images</h3>"
        self._preview_header = widgets.HTML(self._heading, layout=Layout(height='60px'))
        self._img_pane = Box(layout=Layout(display='inline'))
        return VBox([self._controls_pane, self._preview_header, self._img_pane])

    def render(self) -> None:
        "Clear the current cell output and display the widget UI."
        clear_output()
        display(self._ui)

    def clear_imgs(self) -> None:
        "Clear the widget's images preview pane."
        self._preview_header.value = self._heading
        self._img_pane.children = tuple()

    def validate_search_input(self) -> bool:
        "Outline the search box in red when it is empty; return `True` if a search term was entered."
        search_box = self._search_input
        is_empty = search_box.value == str()
        search_box.layout = Layout(border="solid 2px red", height='auto') if is_empty else Layout()
        return not is_empty

    def on_download_button_click(self, btn) -> None:
        "Download button click handler: validate search term and download images. `btn` is unused."
        term = self._search_input.value
        limit = int(self._count_input.value)
        size = self._size_input.value
        if not self.validate_search_input(): return
        self.clear_imgs()
        downloaded_images = download_google_images(self._path, term, n_images=limit, size=size)
        # Preview at most 12 images, fewer if fewer were requested.
        self.display_images_widgets(downloaded_images[:min(limit, 12)])
        self._preview_header.value = self._download_complete_heading
        self.render()

    def display_images_widgets(self, fnames:list) -> None:
        "Display a few preview images in the notebook"
        imgs = []
        for fname in fnames:
            # Read through a context manager so each file handle is closed right away.
            with open(fname, 'rb') as img_file:
                imgs.append(widgets.Image(value=img_file.read(), width='200px'))
        self._img_pane.children = tuple(imgs)
76
77
78
def download_google_images(path:PathOrStr, search_term:str, size:str='>400*300', n_images:int=10, format:str='jpg',
                           max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:
    """
    Search Google Images for `search_term` (filtered by `size` and `format`), download up to
    `n_images` results into `path`/`search_term` with `max_workers` workers, verify them,
    and return the image files found in that folder.

    Raises `RuntimeError` when no image could be downloaded.
    """
    target_dir = Path(path)/search_term
    url = _search_url(search_term, size=size, format=format)
    # Up to 100 results are parsed from a plain HTTP response; larger requests go through the webdriver.
    fetch = _fetch_img_tuples if n_images <= 100 else _fetch_img_tuples_webdriver
    found = fetch(url, format=format, n_images=n_images)
    saved = _download_images(target_dir, found, max_workers=max_workers, timeout=timeout)
    if len(saved) == 0:
        raise RuntimeError("Couldn't download any images.")
    verify_images(target_dir, max_workers=max_workers)
    return get_image_files(target_dir)
92
93
def _url_params(size:str='>400*300', format:str='jpg') -> str:
    """
    Build Google Images Search Url params and return them as a string.

    The result is the `&tbs=` query fragment combining the size filter looked up in
    `_img_sizes` with the file-type filter for `format`.
    Raises `RuntimeError` if `size` or `format` is not one of the supported values.
    """
    # Every file-type filter uses the `ift:` prefix, webp included.
    _fmts = {'jpg':'ift:jpg', 'gif':'ift:gif', 'png':'ift:png', 'bmp':'ift:bmp',
             'svg':'ift:svg', 'webp':'ift:webp', 'ico':'ift:ico'}
    if size not in _img_sizes:
        raise RuntimeError(f"Unexpected size argument value: {size}.\n"
                           "See `widgets.image_downloader._img_sizes` for supported sizes.")
    if format not in _fmts:
        raise RuntimeError(f"Unexpected image file format: {format}. Use jpg, gif, png, bmp, svg, webp, or ico.")
    return "&tbs=" + _img_sizes[size] + "," + _fmts[format]
102
103
def _search_url(search_term:str, size:str='>400*300', format:str='jpg') -> str:
    "Build the Google Images Search URL for `search_term`, including the `size`/`format` filters."
    encoded_term = quote(search_term)
    filters = _url_params(size, format)
    return ''.join([
        'https://www.google.com/search?q=', encoded_term,
        '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch',
        filters,
        '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg',
    ])
108
109
def _img_fname(img_url:str) -> str:
    "Return whatever follows the last `/` of `img_url` (the file name, extension included)."
    _, _, tail = img_url.rpartition('/')
    return tail
112
113
def _fetch_img_tuples(url:str, format:str='jpg', n_images:int=10) -> list:
    "Request the Google Images Search page at `url` and return its image metadata as tuples (fname, url)."
    # The request is sent with a desktop Chrome user agent string.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    response = requests.get(url, headers={'User-Agent': user_agent})
    return _html_to_img_tuples(response.text, format=format, n_images=n_images)
118
119
def _html_to_img_tuples(html:str, format:str='jpg', n_images:int=10) -> list:
    "Parse the google images html into at most `n_images` img tuples containing `(fname, url)`."
    soup = BeautifulSoup(html, 'html.parser')
    meta_tags = soup.find_all('div', {'class': 'rg_meta'})

    def _matching_tuples():
        # Each `rg_meta` div holds a JSON blob; 'ou' is read as the image url and 'ity' as its type.
        for tag in meta_tags:
            meta = json.loads(tag.text)
            if meta['ity'] == format:
                yield (_img_fname(meta['ou']), meta['ou'])

    # Lazy evaluation: stop decoding metadata once `n_images` matches were collected.
    return list(itertools.islice(_matching_tuples(), n_images))
126
127
def _fetch_img_tuples_webdriver(url:str, format:str='jpg', n_images:int=150) -> list:
    """
    Parse the Google Images Search for urls and return the image metadata as tuples (fname, url).
    Use this for downloads of >100 images. Requires `selenium`.

    Raises `ImportError` if `selenium` is missing, re-raises whatever error prevented the
    Chrome driver from starting, and raises `ValueError` if fewer than `n_images` results
    are present on the page after scrolling.
    """
    try:
        from selenium import webdriver
        from selenium.webdriver.common.by import By
    except ImportError:
        print("Looks like you're trying to download > 100 images and `selenium`\n"
              "is not installed. Try running `pip install selenium` to fix this.\n"
              "You'll also need chrome and `chromedriver` installed.")
        # Re-raise: continuing would only fail later with an unrelated NameError.
        raise
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    try:
        driver = webdriver.Chrome(options=options)
    except Exception:
        print("Error initializing chromedriver.\n"
              "Check if it's in your path by running `which chromedriver`")
        # Re-raise so the real cause is visible instead of a NameError on `driver`.
        raise
    try:
        driver.set_window_size(1440, 900)
        driver.get(url)

        # Scroll to the bottom once per 100 requested images (plus once more),
        # pausing 0.5-1.0s between scrolls - presumably to let more results load.
        for _ in range(n_images // 100 + 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(0.5 + random.random()/2.0)

        n_available = len(driver.find_elements(By.CSS_SELECTOR, "div.rg_meta"))
        if n_available < n_images:
            raise ValueError(f"Requested {n_images} images, but only found {n_available}.")

        html = driver.page_source
    finally:
        # Always shut the browser down, including when the ValueError above is raised.
        driver.quit()
    return _html_to_img_tuples(html, format=format, n_images=n_images)
158
159
def _download_images(label_path:PathOrStr, img_tuples:list, max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:
    """
    Download every image listed in `img_tuples` into `label_path` and return the image files found there.
    The directory is created first when it does not exist yet.
    The downloads are dispatched through `parallel` with `max_workers` workers.
    If something doesn't work, try setting up `max_workers=0` to debug.
    """
    target_dir = Path(label_path)
    os.makedirs(target_dir, exist_ok=True)
    # `parallel` supplies each img tuple and its index to the partially-applied downloader.
    download_one = partial(_download_single_image, label_path, timeout=timeout)
    parallel(download_one, img_tuples, max_workers=max_workers)
    return get_image_files(label_path)
169
170
def _download_single_image(label_path:Path, img_tuple:tuple, i:int, timeout:int=4) -> None:
    """
    Downloads a single image from Google Search results to `label_path`
    given an `img_tuple` that contains `(fname, url)` of an image to download.
    `i` is just an iteration number `int`; it becomes the zero-padded file name.
    """
    img_url = img_tuple[1]
    # Take the extension that sits right before the query string or the end of the url;
    # fall back to '.jpg' when the url has none.
    suffix = re.findall(r'\.\w+?(?=(?:\?|$))', img_url)
    suffix = suffix[0].lower() if len(suffix)>0 else '.jpg'
    fname = f"{i:08d}{suffix}"
    # `label_path` may arrive as a plain string, so make sure `/` joins paths.
    download_url(img_url, Path(label_path)/fname, timeout=timeout)
180
181