CoCalc -- image_downloader.py

GitHub Repository: jantic/deoldify
Path: blob/master/fastai/widgets/image_downloader.py
846 views
1
from ..core import *
2
from ..vision.data import *
3
from ipywidgets import widgets, Layout, Output, HBox, VBox, Text, BoundedIntText, Button, Dropdown, Box
4
from IPython.display import clear_output, display
5
from urllib.parse import quote
6
from bs4 import BeautifulSoup
7
import time
8

9
__all__ = ['ImageDownloader', 'download_google_images']
10

11
# Supported minimum-size filters.
# Keys are the labels offered in the widget's size dropdown and accepted as the `size`
# argument; values are the fragments `_url_params` appends to the `tbs` query parameter.
# Every value uses the same `isz:lt,islt:<code>` shape.
_img_sizes = {
    '>400*300':  'isz:lt,islt:qsvga',
    '>640*480':  'isz:lt,islt:vga',
    '>800*600':  'isz:lt,islt:svga',
    '>1024*768': 'isz:lt,islt:xga',   # was 'visz:lt,...' (typo that broke this filter)
    '>2MP':      'isz:lt,islt:2mp',
    '>4MP':      'isz:lt,islt:4mp',
    '>6MP':      'isz:lt,islt:6mp',
    '>8MP':      'isz:lt,islt:8mp',
    '>10MP':     'isz:lt,islt:10mp',
    '>12MP':     'isz:lt,islt:12mp',
    '>15MP':     'isz:lt,islt:15mp',
    '>20MP':     'isz:lt,islt:20mp',
    '>40MP':     'isz:lt,islt:40mp',
    '>70MP':     'isz:lt,islt:70mp',
}
15

16
class ImageDownloader():
    """
    Displays a widget that allows searching and downloading images from google images search
    in a Jupyter Notebook or Lab.

    The widget consists of a search box, an image count, a minimum-size dropdown and a
    download button. Clicking the button downloads the images below `path` and shows up to
    12 of them as a preview.
    """
    def __init__(self, path:Union[Path,str]='data'):
        "Setup path to save images to, init the UI, and render the widgets."
        self._path = Path(path)
        self._ui = self._init_ui()
        self.render()

    def _init_ui(self) -> VBox:
        "Initialize the widget UI and return the UI."
        # Input controls, laid out left to right in a single row.
        self._search_input = Text(placeholder="What images to search for?")
        self._count_input = BoundedIntText(placeholder="How many pics?", value=10, min=1, max=5000, step=1,
                                           layout=Layout(width='60px'))
        self._size_input = Dropdown(options=list(_img_sizes), value='>400*300', layout=Layout(width='120px'))
        self._download_button = Button(description="Search & Download", icon="download", layout=Layout(width='200px'))
        self._download_button.on_click(self.on_download_button_click)
        self._output = Output()
        self._controls_pane  = HBox([self._search_input, self._count_input, self._size_input, self._download_button],
                                    layout=Layout(width='auto', height='40px'))
        # The heading is empty until a download finishes.
        self._heading = ""
        self._download_complete_heading = "<h3>Download complete. Here are a few images</h3>"
        self._preview_header = widgets.HTML(self._heading, layout=Layout(height='60px'))
        self._img_pane = Box(layout=Layout(display='inline'))
        return VBox([self._controls_pane, self._preview_header, self._img_pane])

    def render(self) -> None:
        "Clear the current output and display the widget UI."
        clear_output()
        display(self._ui)

    def clear_imgs(self) -> None:
        "Clear the widget's images preview pane."
        self._preview_header.value = self._heading
        self._img_pane.children = tuple()

    def validate_search_input(self) -> bool:
        "Return `True` if a search term was entered; otherwise outline the search box in red."
        search_box = self._search_input
        is_valid = search_box.value != ''
        # A fresh default `Layout` removes the red border left by a previous failed validation.
        search_box.layout = Layout() if is_valid else Layout(border="solid 2px red", height='auto')
        return is_valid

    def on_download_button_click(self, btn) -> None:
        "Download button click handler: validate search term and download images."
        if not self.validate_search_input(): return
        term = self._search_input.value
        limit = int(self._count_input.value)
        size = self._size_input.value
        self.clear_imgs()
        downloaded_images = download_google_images(self._path, term, n_images=limit, size=size)
        # Preview at most 12 images, regardless of how many were requested.
        self.display_images_widgets(downloaded_images[:min(limit, 12)])
        self._preview_header.value = self._download_complete_heading
        self.render()

    def display_images_widgets(self, fnames:list) -> None:
        "Display a few preview images in the notebook"
        # `read_bytes` opens and closes each file, so no file handles are leaked.
        imgs = [widgets.Image(value=Path(f).read_bytes(), width='200px') for f in fnames]
        self._img_pane.children = tuple(imgs)
76

77

78
def download_google_images(path:PathOrStr, search_term:str, size:str='>400*300', n_images:int=10, format:str='jpg',
                            max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:
    """
    Find `n_images` Google Images results for `search_term` that satisfy the `size` and `format`
    filters, save them under `path`/`search_term` with `max_workers` workers, verify the saved
    files and return the image files found in that folder.
    Raises `RuntimeError` when nothing could be downloaded.
    """
    dest_dir = Path(path)/search_term
    url = _search_url(search_term, size=size, format=format)
    # Up to 100 results are scraped with a plain request; larger requests use the webdriver fetcher.
    fetch = _fetch_img_tuples if n_images <= 100 else _fetch_img_tuples_webdriver
    found = fetch(url, format=format, n_images=n_images)
    saved = _download_images(dest_dir, found, max_workers=max_workers, timeout=timeout)
    if len(saved) == 0:
        raise RuntimeError("Couldn't download any images.")
    verify_images(dest_dir, max_workers=max_workers)
    return get_image_files(dest_dir)
92
    
93
def _url_params(size:str='>400*300', format:str='jpg') -> str:
    """
    Build Google Images Search Url params and return them as a string.

    The result is `"&tbs=<size filter>,<file type filter>"`, where the size filter comes from
    `_img_sizes[size]` and the file type filter is `ift:<format>`.
    Raises `RuntimeError` if `size` is not a key of `_img_sizes` or `format` is not supported.
    """
    # Every file type filter has the form `ift:<ext>`; 'webp' used to map to a bare 'webp'.
    _fmts = {'jpg':'ift:jpg','gif':'ift:gif','png':'ift:png','bmp':'ift:bmp', 'svg':'ift:svg','webp':'ift:webp','ico':'ift:ico'}
    if size not in _img_sizes:
        raise RuntimeError(f"""Unexpected size argument value: {size}.
                    See `widgets.image_downloader._img_sizes` for supported sizes.""")
    if format not in _fmts:
        raise RuntimeError(f"Unexpected image file format: {format}. Use jpg, gif, png, bmp, svg, webp, or ico.")
    return "&tbs=" + _img_sizes[size] + "," + _fmts[format]
102

103
def _search_url(search_term:str, size:str='>400*300', format:str='jpg') -> str:
    "Build the Google Images Search URL for `search_term`, filtered by `size` and `format`."
    # The search term is percent-encoded; the filter params come from `_url_params`.
    pieces = [
        'https://www.google.com/search?q=',
        quote(search_term),
        '&espv=2&biw=1366&bih=667&site=webhp&source=lnms&tbm=isch',
        _url_params(size, format),
        '&sa=X&ei=XosDVaCXD8TasATItgE&ved=0CAcQ_AUoAg',
    ]
    return ''.join(pieces)
108

109
def _img_fname(img_url:str) -> str:
    "Return the part of `img_url` after its last '/' (the whole string if it has no '/')."
    _, _, tail = img_url.rpartition('/')
    return tail
112

113
def _fetch_img_tuples(url:str, format:str='jpg', n_images:int=10) -> list:
    "Request the Google Images Search page at `url` and return its image metadata as tuples (fname, url)."
    # The request is sent with a desktop Chrome User-Agent header.
    user_agent = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36'
    response = requests.get(url, headers={'User-Agent': user_agent})
    return _html_to_img_tuples(response.text, format=format, n_images=n_images)
118

119
def _html_to_img_tuples(html:str, format:str='jpg', n_images:int=10) -> list:
    "Parse the google images html to at most `n_images` img tuples containing `(fname, url)`"
    soup = BeautifulSoup(html, 'html.parser')
    meta_tags = soup.find_all('div', {'class': 'rg_meta'})

    def _matching_tuples():
        # Each `rg_meta` div holds a JSON document; 'ou' is used as the image url
        # and 'ity' is compared against the requested format.
        for tag in meta_tags:
            meta = json.loads(tag.text)
            if meta['ity'] == format:
                yield _img_fname(meta['ou']), meta['ou']

    # Lazy on purpose: parsing stops as soon as `n_images` matches were produced.
    return list(itertools.islice(_matching_tuples(), n_images))
126

127
def _fetch_img_tuples_webdriver(url:str, format:str='jpg', n_images:int=150) -> list:
    """
    Parse the Google Images Search for urls and return the image metadata as tuples (fname, url).
    Use this for downloads of >100 images. Requires `selenium`.

    Loads `url` in headless Chrome, scrolls to the bottom of the page `n_images // 100 + 1`
    times (sleeping between 0.5 and 1 second after each scroll) and parses the page source.

    Raises `ImportError` if `selenium` cannot be imported, re-raises the error from
    `webdriver.Chrome` if the driver cannot be started, and raises `ValueError` if fewer
    than `n_images` `div.rg_meta` elements are present after scrolling.
    """
    try:
        from selenium import webdriver
    except ImportError:
        print("""Looks like you're trying to download > 100 images and `selenium`
                is not installed. Try running `pip install selenium` to fix this. 
                You'll also need chrome and `chromedriver` installed.""")
        # Re-raise: continuing would only fail later with a confusing unbound-name error.
        raise
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    # NOTE(review): `chrome_options=` and `find_elements_by_css_selector` below are older
    # selenium spellings; confirm they match the selenium version this project targets.
    try: driver = webdriver.Chrome(chrome_options=options)
    except Exception:
        print("""Error initializing chromedriver. 
                    Check if it's in your path by running `which chromedriver`""")
        raise
    try:
        driver.set_window_size(1440, 900)
        driver.get(url)

        for _ in range(n_images // 100 + 1):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
            time.sleep(0.5 + random.random()/2.0)

        n_available = len(driver.find_elements_by_css_selector("div.rg_meta"))
        if n_available < n_images:
            raise ValueError(f"Requested {n_images} images, but only found {n_available}.")

        html = driver.page_source
    finally:
        # Always shut the browser down, including when an error is raised above.
        driver.quit()
    return _html_to_img_tuples(html, format=format, n_images=n_images)
158

159
def _download_images(label_path:PathOrStr, img_tuples:list, max_workers:int=defaults.cpus, timeout:int=4) -> FilePathList:
    """
    Downloads images in `img_tuples` to `label_path`.
    If the directory doesn't exist, it'll be created automatically.
    Uses `parallel` to speed things up in `max_workers` when the system has enough CPU cores.
    If something doesn't work, try setting up `max_workers=0` to debug.
    Returns the image files found in `label_path` afterwards.
    """
    # Convert once: the worker joins `label_path/fname`, which fails for a plain `str`.
    label_path = Path(label_path)
    os.makedirs(label_path, exist_ok=True)
    parallel(partial(_download_single_image, label_path, timeout=timeout), img_tuples, max_workers=max_workers)
    return get_image_files(label_path)
169

170
def _download_single_image(label_path:Path, img_tuple:tuple, i:int, timeout:int=4) -> None:
    """
    Downloads a single image from Google Search results to `label_path`
    given an `img_tuple` that contains `(fname, url)` of an image to download.
    `i` is just an iteration number `int`.

    The file is saved as `i` zero-padded to 8 digits plus a lower-cased extension taken from
    the url: the first `.ext` that is directly followed by `?` or the end of the url,
    falling back to `.jpg` when there is none.
    """
    url = img_tuple[1]  # was `img_Tuple`, an undefined name that raised NameError on every call
    suffix = re.findall(r'\.\w+?(?=(?:\?|$))', url)
    suffix = suffix[0].lower() if len(suffix)>0  else '.jpg'
    fname = f"{i:08d}{suffix}"
    # `Path(...)` keeps the join working when a plain `str` directory is passed.
    download_url(url, Path(label_path)/fname, timeout=timeout)
180

181
Product

Resources

Company