# tidecv/datasets.py, from the GitHub repository dbolya/tide
from .data import Data
from . import functions as f

import zipfile
from pathlib import Path
from appdirs import user_data_dir
import urllib.request
from collections import defaultdict
import shutil
import json
import os

def default_name(path:str) -> str:
    """ Returns the file name of a path, without its extension. """
    return os.path.splitext(os.path.basename(path))[0]
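
# e.g. (illustrative only): default_name('annotations/instances_val2017.json') -> 'instances_val2017'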

def get_tide_path():
    """ Returns the local directory used to cache downloaded annotations, creating it if necessary. """
    if 'TIDE_PATH' in os.environ:
        tide_path = os.environ['TIDE_PATH']
    else:
        tide_path = user_data_dir('tidecv', appauthor=False)

    if not os.path.exists(tide_path):
        os.makedirs(tide_path)

    return tide_path
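
# Usage sketch ('/tmp/tide_cache' is a hypothetical override; left unset, the path comes from appdirs):
#
#   os.environ['TIDE_PATH'] = '/tmp/tide_cache'
#   print(get_tide_path())  # -> '/tmp/tide_cache', created if it didn't exist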

def download_annotations(name:str, url:str, force_download:bool=False) -> str:
    """ Downloads and extracts a zip of annotations into the TIDE cache, returning the extraction path. """
    tide_path = get_tide_path()
    candidate_path = os.path.join(tide_path, name)
    finished_file_path = os.path.join(candidate_path, '_finished')
    zip_file_path = os.path.join(candidate_path, '_tmp.zip')

    # Check whether the annotations have already been downloaded.
    # If there isn't a file called _finished, the previous download didn't complete, so try again.
    already_downloaded = os.path.exists(candidate_path) and os.path.exists(finished_file_path)

    if not force_download and already_downloaded:
        return candidate_path
    else:
        print('{} annotations not found. Downloading...'.format(name))

    if os.path.exists(candidate_path):
        shutil.rmtree(candidate_path)
    os.makedirs(candidate_path)

    urllib.request.urlretrieve(url, zip_file_path)
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(candidate_path)

    os.remove(zip_file_path)
    open(finished_file_path, 'a').close() # Make an empty _finished file to mark that we were successful

    print('Successfully downloaded {} to "{}"'.format(name, candidate_path))
    return candidate_path
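
# Usage sketch (hedged example; this mirrors what COCO() below does automatically, and the URL is
# the public COCO 2017 annotations zip):
#
#   ann_dir = download_annotations(
#       'COCO2017',
#       'http://images.cocodataset.org/annotations/annotations_trainval2017.zip')
#   # ann_dir now holds the extracted files plus an empty '_finished' marker file.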

def COCO(path:str=None, name:str=None, year:int=2017, ann_set:str='val', force_download:bool=False) -> Data:
    """
    Loads ground truth from a COCO-style annotation file.

    If path is not specified, this will download the COCO annotations for the year and ann_set specified.
    Valid years are 2014 and 2017, and valid ann_sets are 'val' and 'train'.
    """
    if path is None:
        path = download_annotations(
            'COCO{}'.format(year),
            'http://images.cocodataset.org/annotations/annotations_trainval{}.zip'.format(year),
            force_download)

        path = os.path.join(path, 'annotations', 'instances_{}{}.json'.format(ann_set, year))

    if name is None: name = default_name(path)

    with open(path, 'r') as json_file:
        cocojson = json.load(json_file)

    images = cocojson['images']
    anns   = cocojson['annotations']
    cats   = cocojson['categories'] if 'categories' in cocojson else None

    # Add everything from the COCO json into our data structure
    data = Data(name, max_dets=100)

    image_lookup = {}

    for image in images:
        image_lookup[image['id']] = image
        data.add_image(image['id'], image['file_name'])

    if cats is not None:
        for cat in cats:
            data.add_class(cat['id'], cat['name'])

    for ann in anns:
        image  = ann['image_id']
        _class = ann['category_id']
        box    = ann['bbox']
        mask   = f.toRLE(ann['segmentation'], image_lookup[image]['width'], image_lookup[image]['height'])

        # Crowd annotations are ignore regions in COCO evaluation; everything else is a normal GT box.
        if ann['iscrowd']:
            data.add_ignore_region(image, _class, box, mask)
        else:
            data.add_ground_truth(image, _class, box, mask)

    return data
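
# Usage sketch (assumes network access for the automatic download; alternatively, pass `path`
# pointing at a local instances_*.json file):
#
#   gt = COCO(year=2017, ann_set='val')       # downloads and caches COCO2017 val annotations
#   gt = COCO(path='instances_val2017.json')  # or load a local annotation file directly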

def COCOResult(path:str, name:str=None) -> Data:
    """ Loads predictions from a COCO-style results file. """
    if name is None: name = default_name(path)

    with open(path, 'r') as json_file:
        dets = json.load(json_file)

    data = Data(name)

    for det in dets:
        image = det['image_id']
        _cls  = det['category_id']
        score = det['score']
        box   = det['bbox']         if 'bbox'         in det else None
        mask  = det['segmentation'] if 'segmentation' in det else None

        data.add_detection(image, _cls, score, box, mask)

    return data
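
# Usage sketch ('my_results.json' is a hypothetical detector output in the standard COCO results
# format: a list of {'image_id', 'category_id', 'score', 'bbox' and/or 'segmentation'} dicts):
#
#   preds = COCOResult('my_results.json', name='my_detector')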

def LVIS(path:str=None, name:str=None, version_str:str='v1', force_download:bool=False) -> Data:
    """
    Loads an LVIS-style dataset.
    The version string is used for downloading the dataset and should be one of the versions of LVIS (e.g., v0.5, v1).

    Note that LVIS evaluation is special, but we can emulate it by adding ignore regions.
    The detector isn't penalized for predicting classes that the LVIS annotators haven't guaranteed
    to be in or out of the image (i.e., classes outside the union of GT-annotated classes and classes
    explicitly marked as not in the image). To emulate this behavior, we add ignore region labels for
    every class not verified to be in the image. This is not as inefficient as it sounds, because
    ignore regions are separated out during mAP calculation and error processing, so adding a bunch
    of them doesn't hurt.

    The LVIS AP numbers are slightly lower than what the LVIS API reports because of these workarounds.
    """
    if path is None:
        path = download_annotations(
            'LVIS{}'.format(version_str),
            'https://s3-us-west-2.amazonaws.com/dl.fbaipublicfiles.com/LVIS/lvis_{}_val.json.zip'.format(version_str),
            force_download)

        path = os.path.join(path, 'lvis_{}_val.json'.format(version_str))

    if name is None: name = default_name(path)

    with open(path, 'r') as json_file:
        lvisjson = json.load(json_file)

    images = lvisjson['images']
    anns   = lvisjson['annotations']
    cats   = lvisjson['categories'] if 'categories' in lvisjson else None

    data = Data(name, max_dets=300)
    image_lookup = {}
    classes_in_img = defaultdict(set)

    for image in images:
        image_lookup[image['id']] = image
        data.add_image(image['id'], image['coco_url']) # LVIS has no image names, only coco urls

        # Negative categories are guaranteed by the annotators to not be in the image,
        # so we should consider them during evaluation.
        for cat_id in image['neg_category_ids']:
            classes_in_img[image['id']].add(cat_id)

    if cats is not None:
        for cat in cats:
            data.add_class(cat['id'], cat['synset'])

    for ann in anns:
        image  = ann['image_id']
        _class = ann['category_id']
        box    = ann['bbox']
        mask   = f.toRLE(ann['segmentation'], image_lookup[image]['width'], image_lookup[image]['height'])

        data.add_ground_truth(image, _class, box, mask)

        # There's an annotation for this class, so we should consider the class for evaluation.
        classes_in_img[image].add(_class)

    all_classes = set(data.classes.keys())

    # LVIS doesn't penalize the detector for detecting classes that the annotators haven't guaranteed
    # to be in or out of the image. Here we simulate that property by adding ignore regions for all
    # such classes.
    for image in images:
        ignored_classes = all_classes.difference(classes_in_img[image['id']])

        # LVIS also doesn't penalize the detector for mistakes made on classes explicitly marked as
        # not exhaustively annotated, which we can emulate the same way, so add those categories to
        # the ignored classes too.
        ignored_classes.update(set(image['not_exhaustive_category_ids']))

        for _cls in ignored_classes:
            data.add_ignore_region(image['id'], _cls)

    return data
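
# Usage sketch (assumes network access; the downloader above fetches the LVIS val annotations):
#
#   gt = LVIS(version_str='v1')  # downloads and caches lvis_v1_val.json if needed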

def LVISResult(path:str, name:str=None) -> Data:
    """ Loads predictions from an LVIS-style results file. Note that this is the same format as a COCO-style results file. """
    return COCOResult(path, name)

def Pascal(path:str=None, name:str=None, year:int=2007, ann_set:str='val', force_download:bool=False) -> Data:
    """
    Loads the Pascal VOC 2007 or 2012 data from a COCO-style json.

    Valid years are 2007 and 2012, and valid ann_sets are 'train' and 'val'.
    """
    if path is None:
        path = download_annotations(
            'Pascal',
            'https://s3.amazonaws.com/images.cocodataset.org/external/external_PASCAL_VOC.zip',
            force_download)

        path = os.path.join(path, 'PASCAL_VOC', 'pascal_{}{}.json'.format(ann_set, year))

    return COCO(path, name)
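
# Usage sketch (assumes network access for the COCO-hosted Pascal VOC jsons):
#
#   gt = Pascal(year=2007, ann_set='val')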

def Cityscapes(path:str, name:str=None):
    """
    Loads the fine Cityscapes annotations as instance segmentation masks, and also generates bounding boxes for them.

    Note that we can't automatically download Cityscapes because it requires registration and an agreement to the ToS.
    You can get Cityscapes here: https://www.cityscapes-dataset.com/

    Path should be to gtFine/<ann_set>. E.g., <path_to_cityscapes>/gtFine/val.
    """
    if name is None: name = default_name(path)
    data = Data(name)

    instance_classes = {
        'person'    : 24,
        'rider'     : 25,
        'car'       : 26,
        'truck'     : 27,
        'train'     : 31,
        'motorcycle': 32,
        'bicycle'   : 33,
        'bus'       : 28,
        'caravan'   : 29,
        'trailer'   : 30,
    }

    ignored_classes = set([29, 30])

    for class_name, class_id in instance_classes.items():
        data.add_class(class_id, class_name)

    for ann in Path(path).glob('*/*.json'):
        with open(ann) as json_file:
            ann_json = json.load(json_file)

        # Note: a string for an image ID is okay
        image_id = os.path.basename(ann).replace('_gtFine_polygons.json', '')
        objs = ann_json['objects']

        data.add_image(image_id, image_id) # The id in this case is just the name of the image

        # Caravan and trailer should be ignored in all evaluation
        for _cls in ignored_classes:
            data.add_ignore_region(image_id, _cls)

        for obj in objs:
            class_label = obj['label']
            is_crowd = False

            # Cityscapes labelers can mark objects without defined boundaries as 'group'. In COCO-land
            # this would be a crowd annotation, so in that case make it an ignore region.
            if class_label.endswith('group'):
                is_crowd = True
                class_label = class_label[:-5] # Remove the 'group' suffix

            # We are only considering instance classes
            if class_label not in instance_classes:
                continue

            class_id = instance_classes[class_label]

            # If the class is not used in evaluation, don't include it
            if class_id in ignored_classes:
                continue

            # Flatten the list of [x, y] points into a single flat coordinate list (every 2 ints is
            # one point), wrapped in a list to match the COCO polygon format.
            poly = [sum(obj['polygon'], [])]
            box  = f.polyToBox(poly)

            if is_crowd:
                data.add_ignore_region(image_id, class_id, box, poly)
            else:
                data.add_ground_truth(image_id, class_id, box, poly)

    return data
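
# Usage sketch ('<path_to_cityscapes>' is a placeholder for wherever the gtFine annotations
# were extracted):
#
#   gt = Cityscapes('<path_to_cityscapes>/gtFine/val')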