Path: blob/master/chapter12_object-detection.ipynb
709 views
Kernel: Python 3
This is a companion notebook for the book Deep Learning with Python, Third Edition. For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.
If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.
The book's contents are available online at deeplearningwithpython.io.
In [0]:
!pip install keras keras-hub --upgrade -q
In [0]:
import os

# Select the Keras backend through the KERAS_BACKEND environment variable.
# It is set here, ahead of the later `import keras` cell; the helper cell below
# tells users that changing it requires restarting the runtime.
os.environ["KERAS_BACKEND"] = "jax"
In [0]:
# @title
import os
from IPython.core.magic import register_cell_magic


@register_cell_magic
def backend(line, cell):
    """Cell magic that runs `cell` only under the requested Keras backend.

    The required backend is the last whitespace-separated word of `line`.
    If it matches the `KERAS_BACKEND` environment variable the cell body is
    executed; otherwise a message explaining how to switch is printed.
    """
    current = os.environ.get("KERAS_BACKEND", "")
    required = line.split()[-1]
    if current != required:
        # Wrong backend: explain instead of running the cell.
        print(
            f"This cell requires the {required} backend. To run it, change KERAS_BACKEND to "
            f"\"{required}\" at the top of the notebook, restart the runtime, and rerun the notebook."
        )
        return
    get_ipython().run_cell(cell)
Object detection
Single-stage vs. two-stage object detectors
Two-stage R-CNN detectors
Single-stage detectors
Training a YOLO model from scratch
Downloading the COCO dataset
In [0]:
import keras
import keras_hub

# Download and extract the COCO 2017 training images archive.
# `images_path` is whatever location `get_file` returns for it.
images_path = keras.utils.get_file(
    "coco",
    "http://images.cocodataset.org/zips/train2017.zip",
    extract=True,
)
# Download and extract the matching train/val 2017 annotations archive.
annotations_path = keras.utils.get_file(
    "annotations",
    "http://images.cocodataset.org/annotations/annotations_trainval2017.zip",
    extract=True,
)
In [0]:
import json

# Explicit UTF-8 so the read does not depend on the platform's default encoding.
with open(
    f"{annotations_path}/annotations/instances_train2017.json",
    "r",
    encoding="utf-8",
) as f:
    annotations = json.load(f)

# Image records indexed by their id for direct lookup from each annotation.
images = {image["id"]: image for image in annotations["images"]}


def scale_box(box, width, height):
    """Rescale a pixel-space (x, y, w, h) box to unit-square coordinates.

    All four values are divided by the longer image side. The shorter axis is
    then shifted by half of the leftover margin, so the box is positioned as
    if the image were centered inside a square of side 1.

    Args:
        box: iterable of four numbers (x, y, w, h) in pixels.
        width: image width in pixels.
        height: image height in pixels.

    Returns:
        A list [x, y, w, h] in unit-square coordinates.
    """
    scale = 1.0 / max(width, height)
    x, y, w, h = [v * scale for v in box]
    x += (height - width) * scale / 2 if height > width else 0
    y += (width - height) * scale / 2 if width > height else 0
    return [x, y, w, h]


# Group annotations per image: one record holding boxes, labels and file path.
metadata = {}
for annotation in annotations["annotations"]:
    # Named `image_id` (not `id`) so the builtin `id` is not shadowed for the
    # rest of the notebook.
    image_id = annotation["image_id"]
    image = images[image_id]
    if image_id not in metadata:
        # The path only depends on the image, so it is stored once at creation.
        metadata[image_id] = {
            "boxes": [],
            "labels": [],
            "path": images_path + "/train2017/" + image["file_name"],
        }
    box = scale_box(annotation["bbox"], image["width"], image["height"])
    metadata[image_id]["boxes"].append(box)
    metadata[image_id]["labels"].append(annotation["category_id"])
# Downstream cells index samples by position, so drop the id keys.
metadata = list(metadata.values())
In [0]:
# Number of images that have at least one annotation (one record per image id).
len(metadata)
In [0]:
# Smallest number of boxes found on any single image.
min([len(x["boxes"]) for x in metadata])
In [0]:
# Largest number of boxes found on any single image.
max([len(x["boxes"]) for x in metadata])
In [0]:
# One more than the largest category id seen, i.e. the size needed to index
# classes directly by their raw id.
max(max(x["labels"]) for x in metadata) + 1
In [0]:
# Inspect one sample record (its boxes, labels and image path).
metadata[435]
In [0]:
# Map that sample's category ids through keras_hub's `coco_id_to_name` helper.
[keras_hub.utils.coco_id_to_name(x) for x in metadata[435]["labels"]]
In [0]:
import matplotlib.pyplot as plt
from matplotlib.colors import hsv_to_rgb
from matplotlib.patches import Rectangle

# Cache of label -> color. Label 0 is pre-assigned gray.
color_map = {0: "gray"}


def label_to_color(label):
    """Return the color assigned to `label`, creating one on first use.

    A new color takes its hue from the number of colors already assigned,
    multiplied by 0.618 and wrapped to [0, 1); saturation and value are fixed.
    """
    if label in color_map:
        return color_map[label]
    hue = (len(color_map) * 0.618) % 1
    color_map[label] = hsv_to_rgb((hue, 0.5, 0.9))
    return color_map[label]


def draw_box(ax, box, text, color):
    """Draw an unfilled (x, y, w, h) rectangle on `ax` with a text tag at (x, y)."""
    x0, y0, box_w, box_h = box
    outline = Rectangle((x0, y0), box_w, box_h, lw=2, ec=color, fc="none")
    ax.add_patch(outline)
    ax.text(
        x0,
        y0,
        text,
        c="white",
        size=10,
        va="bottom",
        bbox={"fc": color, "pad": 1, "ec": "none"},
    )


def draw_image(ax, image):
    """Show the image file `image` centered in a unit square on `ax`.

    The axes span [0, 1] in both directions with y increasing downward
    (ylim is (1, 0)). The longer image side fills the square and the shorter
    side is padded equally on both ends.
    """
    ax.set(xlim=(0, 1), ylim=(1, 0), xticks=[], yticks=[], aspect="equal")
    pixels = plt.imread(image)
    height, width = pixels.shape[:2]
    hpad = wpad = 0
    if width > height:
        hpad = (1 - height / width) / 2
    elif height > width:
        wpad = (1 - width / height) / 2
    ax.imshow(pixels, extent=[wpad, 1 - wpad, 1 - hpad, hpad])
In [0]:
# Plot one annotated sample: the image plus every ground-truth box with its name.
sample = metadata[435]
# `fig` (previously misspelled `ig`) matches the naming of the other plotting cells.
fig, ax = plt.subplots(dpi=300)
draw_image(ax, sample["path"])
for box, label in zip(sample["boxes"], sample["labels"]):
    label_name = keras_hub.utils.coco_id_to_name(label)
    draw_box(ax, box, label_name, label_to_color(label))
plt.show()
In [0]:
import random

# Keep only the images that carry at most four boxes.
metadata = [sample for sample in metadata if len(sample["boxes"]) <= 4]
# Shuffle with a dedicated, seeded generator: the sample order (and therefore
# everything derived from it later) is then the same on every fresh run, and
# the global `random` state is left untouched.
random.Random(42).shuffle(metadata)
Creating a YOLO model
In [0]:
# Side length, in pixels, of the square images fed to the model.
image_size = 448
# The backbone and its image converter are loaded from the same preset.
resnet_preset = "resnet_50_imagenet"

backbone = keras_hub.models.Backbone.from_preset(resnet_preset)
preprocessor = keras_hub.layers.ImageConverter.from_preset(
    resnet_preset,
    image_size=(image_size, image_size),
)
In [0]:
from keras import layers

grid_size = 6
num_labels = 91
# Values predicted per grid cell: 5 box values followed by one score per label.
cell_depth = num_labels + 5

inputs = keras.Input(shape=(image_size, image_size, 3))

# Backbone features, one strided convolution, then flatten for the dense head.
features = backbone(inputs)
features = layers.Conv2D(512, (3, 3), strides=(2, 2))(features)
features = layers.Flatten()(features)

# Dense head with dropout, projected to one flat vector covering the whole grid.
hidden = layers.Dense(
    2048, activation="relu", kernel_initializer="glorot_normal"
)(features)
hidden = layers.Dropout(0.5)(hidden)
flat_grid = layers.Dense(grid_size * grid_size * cell_depth)(hidden)
grid = layers.Reshape((grid_size, grid_size, cell_depth))(flat_grid)

# First 5 channels are the raw box outputs; the rest go through a softmax.
box_predictions = grid[..., :5]
class_predictions = layers.Activation("softmax")(grid[..., 5:])
outputs = {"box": box_predictions, "class": class_predictions}
model = keras.Model(inputs, outputs)
In [0]:
# Print the model's layer-by-layer summary.
model.summary()
Readying the COCO data for the YOLO model
In [0]:
def to_grid(box):
    """Convert an (x, y, w, h) box to grid form.

    Returns ((ix, iy), (dx, dy, w, h)): the integer grid cell that contains the
    box center, followed by the center's offset inside that cell (measured in
    cells) and the unchanged width and height.
    """
    x, y, w, h = box
    center_x = (x + w / 2) * grid_size
    center_y = (y + h / 2) * grid_size
    col, row = int(center_x), int(center_y)
    return (col, row), (center_x - col, center_y - row, w, h)


def from_grid(loc, box):
    """Undo `to_grid`: rebuild (x, y, w, h) from a cell index and a grid-form box."""
    col, row = loc
    dx, dy, w, h = box
    x = (col + dx) / grid_size - w / 2
    y = (row + dy) / grid_size - h / 2
    return (x, y, w, h)
In [0]:
import numpy as np
import math

# Per-sample grids: a class id per cell, and (x, y, w, h, confidence) per cell.
class_array = np.zeros((len(metadata), grid_size, grid_size))
box_array = np.zeros((len(metadata), grid_size, grid_size, 5))

for index, sample in enumerate(metadata):
    annotated = list(zip(sample["boxes"], sample["labels"]))

    # Pass 1: paint the label onto every cell the box overlaps.
    for (x, y, w, h), label in annotated:
        col_start = math.floor(x * grid_size)
        col_stop = math.ceil((x + w) * grid_size)
        row_start = math.floor(y * grid_size)
        row_stop = math.ceil((y + h) * grid_size)
        class_array[index, row_start:row_stop, col_start:col_stop] = label

    # Pass 2: runs after pass 1 so that the cell holding a box center always
    # ends up with that box's label, plus the box itself at confidence 1.0.
    for box, label in annotated:
        (xi, yi), grid_box = to_grid(box)
        box_array[index, yi, xi] = [*grid_box, 1.0]
        class_array[index, yi, xi] = label
In [0]:
def draw_prediction(image, boxes, classes, cutoff=None):
    """Plot an image with its per-cell class map and its grid-form boxes.

    Args:
        image: path of the image file to show.
        boxes: grid of box entries indexed [row][col]; each entry holds
            (x, y, w, h, confidence) in the form produced by `to_grid`.
        classes: grid of label ids indexed [row][col]; cells with a falsy
            label (0) are left uncolored.
        cutoff: minimum confidence a box needs in order to be drawn. `None`
            draws every box. A value of 0 is honored as a real threshold.
    """
    fig, ax = plt.subplots(dpi=300)
    draw_image(ax, image)
    # Shade each grid cell with the color of its class.
    for yi, row in enumerate(classes):
        for xi, label in enumerate(row):
            color = label_to_color(label) if label else "none"
            x, y, w, h = (v / grid_size for v in (xi, yi, 1.0, 1.0))
            r = Rectangle((x, y), w, h, lw=2, ec="black", fc=color, alpha=0.5)
            ax.add_patch(r)
    # Draw the boxes that pass the confidence cutoff.
    for yi, row in enumerate(boxes):
        for xi, box in enumerate(row):
            box, confidence = box[:4], box[4]
            # `is None` (not truthiness) so that cutoff=0.0 still filters.
            if cutoff is None or confidence >= cutoff:
                box = from_grid((xi, yi), box)
                label = classes[yi, xi]
                color = label_to_color(label)
                name = keras_hub.utils.coco_id_to_name(label)
                # Negative confidences are displayed as 0.00.
                draw_box(ax, box, f"{name} {max(confidence, 0):.2f}", color)
    plt.show()


# Ground-truth boxes were stored with confidence 1.0, so cutoff=1.0 shows only them.
draw_prediction(metadata[0]["path"], box_array[0], class_array[0], cutoff=1.0)
In [0]:
import tensorflow as tf


def load_image(path):
    """Read the JPEG file at `path`, decode it to 3 channels and preprocess it."""
    raw = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw, channels=3)
    return preprocessor(decoded)


paths = [sample["path"] for sample in metadata]
images = tf.data.Dataset.from_tensor_slices(paths).map(
    load_image, num_parallel_calls=8
)
labels = tf.data.Dataset.from_tensor_slices(
    {"box": box_array, "class": class_array}
)
dataset = tf.data.Dataset.zip(images, labels).batch(16).prefetch(2)
# The split happens after batching: the first 500 batches are held out for
# validation and the remaining batches are used for training.
val_dataset = dataset.take(500)
train_dataset = dataset.skip(500)
Training the YOLO model
In [0]:
from keras import ops


def unpack(box):
    """Return the first four components along the last axis of `box` as a tuple."""
    return tuple(box[..., i] for i in range(4))


def intersection(box1, box2):
    """Overlap area of two boxes given as (center x, center y, width, height).

    Evaluates to 0 where the boxes do not overlap.
    """
    cx1, cy1, w1, h1 = unpack(box1)
    cx2, cy2, w2, h2 = unpack(box2)
    left = ops.maximum(cx1 - w1 / 2, cx2 - w2 / 2)
    right = ops.minimum(cx1 + w1 / 2, cx2 + w2 / 2)
    bottom = ops.maximum(cy1 - h1 / 2, cy2 - h2 / 2)
    top = ops.minimum(cy1 + h1 / 2, cy2 + h2 / 2)
    overlap_w = ops.maximum(0.0, right - left)
    overlap_h = ops.maximum(0.0, top - bottom)
    return overlap_w * overlap_h


def intersection_over_union(box1, box2):
    """Intersection-over-union of two boxes; 0 where the union area is 0.

    Negative widths or heights are clamped to 0 before the areas are computed.
    """
    _, _, w1, h1 = unpack(box1)
    _, _, w2, h2 = unpack(box2)
    overlap = intersection(box1, box2)
    area1 = ops.maximum(w1, 0.0) * ops.maximum(h1, 0.0)
    area2 = ops.maximum(w2, 0.0) * ops.maximum(h2, 0.0)
    return ops.divide_no_nan(overlap, area1 + area2 - overlap)
In [0]:
def signed_sqrt(x):
    """Square root of |x| (offset by the Keras epsilon), carrying the sign of x."""
    magnitude = ops.sqrt(ops.absolute(x) + keras.config.epsilon())
    return ops.sign(x) * magnitude


def box_loss(true, pred):
    """Squared-error box loss, summed over axes 1-3 (one value per batch item).

    The last axis of both inputs is split as (x, y), (w, h) and confidence.
    Cells whose true confidence is 0 contribute only a confidence error,
    weighted 0.5. Every other cell contributes position and size errors
    weighted 5.0, plus a confidence error measured against the IoU of the
    true and predicted boxes.
    """
    xy_true, wh_true, conf_true = true[..., :2], true[..., 2:4], true[..., 4:]
    xy_pred, wh_pred, conf_pred = pred[..., :2], pred[..., 2:4], pred[..., 4:]
    empty = conf_true == 0.0

    xy_error = ops.square(xy_true - xy_pred)
    # Sizes are compared on their signed square roots, not on the raw values.
    wh_error = ops.square(signed_sqrt(wh_true) - signed_sqrt(wh_pred))

    # NOTE(review): `to_grid` stores x/y as offsets inside a grid cell while w/h
    # are left unchanged, so this IoU appears to mix units; confirm it is intended.
    iou = ops.expand_dims(intersection_over_union(true, pred), -1)
    conf_target = ops.where(empty, 0.0, iou)
    conf_error = ops.square(conf_target - conf_pred)

    xy_term = ops.where(empty, 0.0, xy_error * 5.0)
    wh_term = ops.where(empty, 0.0, wh_error * 5.0)
    conf_term = ops.where(empty, conf_error * 0.5, conf_error)
    error = ops.concatenate((xy_term, wh_term, conf_term), axis=-1)
    return ops.sum(error, axis=(1, 2, 3))
In [0]:
# One loss per model output: the custom box loss for "box", and sparse
# categorical cross-entropy for "class".
optimizer = keras.optimizers.Adam(2e-4)
losses = {"box": box_loss, "class": "sparse_categorical_crossentropy"}
model.compile(optimizer=optimizer, loss=losses)
model.fit(train_dataset, validation_data=val_dataset, epochs=4)
In [0]:
# Take the first validation example on its own (batch size 1) and predict on it.
x, y = next(iter(val_dataset.rebatch(1)))
preds = model.predict(x)
boxes = preds["box"][0]
# Most likely class id for every grid cell.
classes = np.argmax(preds["class"][0], axis=-1)
# The dataset was built in `metadata` order and validation takes the first
# batches, so the first validation example corresponds to metadata[0].
path = metadata[0]["path"]
draw_prediction(path, boxes, classes, cutoff=0.1)
In [0]:
# Same prediction with no confidence cutoff: the box from every grid cell is drawn.
draw_prediction(path, boxes, classes, cutoff=None)
Using a pretrained RetinaNet detector
In [0]:
# Download the test picture used for the pretrained detector.
url = (
    "https://upload.wikimedia.org/wikipedia/commons/thumb/7/7d/"
    "A_Sunday_on_La_Grande_Jatte%2C_Georges_Seurat%2C_1884.jpg/"
    "1280px-A_Sunday_on_La_Grande_Jatte%2C_Georges_Seurat%2C_1884.jpg"
)
path = keras.utils.get_file(origin=url)
# Wrapping the loaded image in a list gives the array a leading axis of size 1.
image = np.array([keras.utils.load_img(path)])
In [0]:
# Load the pretrained detector from its preset, requesting boxes in the
# "rel_xywh" format, and run it on the one-image batch.
detector = keras_hub.models.ObjectDetector.from_preset(
    "retinanet_resnet50_fpn_v2_coco",
    bounding_box_format="rel_xywh",
)
predictions = detector.predict(image)
In [0]:
# Name and shape of every entry in the prediction dictionary.
[(k, v.shape) for k, v in predictions.items()]
In [0]:
# First predicted box for the first (and only) image in the batch.
predictions["boxes"][0][0]
In [0]:
# Draw the detections reported for the single input image.
fig, ax = plt.subplots(dpi=300)
draw_image(ax, path)

num_detections = predictions["num_detections"][0]
image_boxes = predictions["boxes"][0]
image_labels = predictions["labels"][0]
# Only the first `num_detections` entries are drawn.
for i in range(num_detections):
    label = image_labels[i]
    draw_box(
        ax,
        image_boxes[i],
        keras_hub.utils.coco_id_to_name(label),
        label_to_color(label),
    )
plt.show()