# !wget -qN "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
!wget -qN "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/annotations_trainval2017.zip"
!unzip -qn annotations_trainval2017.zip


from pycocotools.coco import COCO

coco = COCO("annotations/instances_val2017.json")

loading annotations into memory...
Done (t=0.97s)
creating index...
index created!


catIds = coco.getCatIds(catNms=["cat"])  #  Find category ID by tag
print("class ID(cat) =", catIds)

imgIds = coco.getImgIds(catIds=catIds)  # Filtering dataset by category ID
print("All images: %i" % len(imgIds))

class ID(cat) = [17]
All images: 184


img_list = coco.loadImgs(imgIds[0])  # 1 example
img_metadata = img_list[0]
img_metadata

{'license': 5,
 'file_name': '000000416256.jpg',
 'coco_url': 'http://images.cocodataset.org/val2017/000000416256.jpg',
 'height': 375,
 'width': 500,
 'date_captured': '2013-11-18 04:05:18',
 'flickr_url': 'http://farm1.staticflickr.com/34/64280333_7acf38cfb3_z.jpg',
 'id': 416256}


import requests
from PIL import Image
from io import BytesIO
import matplotlib.pyplot as plt


def coco2pil(url):
    response = requests.get(url)
    return Image.open(BytesIO(response.content))


I = coco2pil(img_metadata["coco_url"])
plt.axis("off")
plt.imshow(I)
plt.show()


cats = coco.loadCats(coco.getCatIds())  # loading categories
num2cat = {}
print("COCO categories: ")

iterator = iter(cats)
cat = next(iterator)
for i in range(0, 91):
    if i == cat["id"]:
        num2cat[cat["id"]] = cat["name"]
        name = cat["name"]
        if i < 90:
            cat = next(iterator)
    else:
        name = "---"

    print(f"{i:2}. {name:20}", end="")

    if not i % 6:
        print("\n")

COCO categories: 
 0. ---                 

 1. person               2. bicycle              3. car                  4. motorcycle           5. airplane             6. bus                 

 7. train                8. truck                9. boat                10. traffic light       11. fire hydrant        12. ---                 

13. stop sign           14. parking meter       15. bench               16. bird                17. cat                 18. dog                 

19. horse               20. sheep               21. cow                 22. elephant            23. bear                24. zebra               

25. giraffe             26. ---                 27. backpack            28. umbrella            29. ---                 30. ---                 

31. handbag             32. tie                 33. suitcase            34. frisbee             35. skis                36. snowboard           

37. sports ball         38. kite                39. baseball bat        40. baseball glove      41. skateboard          42. surfboard           

43. tennis racket       44. bottle              45. ---                 46. wine glass          47. cup                 48. fork                

49. knife               50. spoon               51. bowl                52. banana              53. apple               54. sandwich            

55. orange              56. broccoli            57. carrot              58. hot dog             59. pizza               60. donut               

61. cake                62. chair               63. couch               64. potted plant        65. bed                 66. ---                 

67. dining table        68. ---                 69. ---                 70. toilet              71. ---                 72. tv                  

73. laptop              74. mouse               75. remote              76. keyboard            77. cell phone          78. microwave           

79. oven                80. toaster             81. sink                82. refrigerator        83. ---                 84. book                

85. clock               86. vase                87. scissors            88. teddy bear          89. hair drier          90. toothbrush


print(f"categories[2]: {cats[2]}")
print(f"categories[3]: {cats[3]}")

nms = set([cat["supercategory"] for cat in cats])
print("COCO supercategories: \n{}".format("\t".join(nms)))

categories[2]: {'supercategory': 'vehicle', 'id': 3, 'name': 'car'}
categories[3]: {'supercategory': 'vehicle', 'id': 4, 'name': 'motorcycle'}
COCO supercategories: 
animal	food	person	electronic	outdoor	appliance	indoor	vehicle	kitchen	furniture	sports	accessory


annIds = coco.getAnnIds(imgIds=img_metadata["id"])
anns = coco.loadAnns(annIds)

plt.imshow(I)
plt.axis("off")
coco.showAnns(anns)
plt.show()


def dump_anns(anns):
    for i, a in enumerate(anns):
        print(f"\n#{i}")
        for k in a.keys():
            if k == "category_id" and num2cat.get(a[k], None):
                print(k, ": ", a[k], num2cat[a[k]])  # Show cat. name
            else:
                print(k, ": ", a[k])


dump_anns(anns)

#0
segmentation :  [[157.26, 120.16, 165.32, 104.84, 191.94, 86.29, 198.39, 95.97, 193.55, 103.23, 188.71, 118.55, 188.71, 130.65, 191.13, 140.32, 204.03, 145.16, 211.29, 145.16, 262.1, 150.81, 280.65, 150.81, 290.32, 146.77, 300.0, 138.71, 304.03, 136.29, 322.58, 131.45, 324.19, 131.45, 327.42, 143.55, 341.94, 139.52, 367.74, 144.35, 368.55, 146.77, 337.1, 157.26, 312.9, 163.71, 295.16, 168.55, 299.19, 181.45, 305.65, 201.61, 293.55, 239.52, 288.71, 233.87, 265.32, 227.42, 225.0, 221.77, 199.19, 217.74, 170.97, 211.29, 159.68, 207.26, 149.19, 204.03, 137.1, 195.97, 130.65, 191.13, 117.74, 175.81, 134.68, 150.81, 147.58, 134.68]]
area :  15465.184599999999
iscrowd :  0
image_id :  416256
bbox :  [117.74, 86.29, 250.81, 153.23]
category_id :  17 cat
id :  51846

#1
segmentation :  [[122.48, 272.75, 284.67, 309.77, 293.93, 313.33, 313.86, 252.82, 315.29, 238.58, 153.67, 210.81, 123.06, 259.93, 121.64, 267.77]]
area :  12449.917949999995
iscrowd :  0
image_id :  416256
bbox :  [121.64, 210.81, 193.65, 102.52]
category_id :  76 keyboard
id :  1115755

#2
segmentation :  [[28.95, 191.52, 88.37, 123.57, 138.31, 142.22, 125.98, 155.17, 122.5, 166.87, 122.5, 177.93, 122.82, 182.04, 86.79, 227.87]]
area :  5851.351
iscrowd :  0
image_id :  416256
bbox :  [28.95, 123.57, 109.36, 104.3]
category_id :  76 keyboard
id :  1975408


catIds = coco.getCatIds(catNms=["people"])
annIds = coco.getAnnIds(catIds=catIds, iscrowd=True)
anns = coco.loadAnns(annIds[0:1])

dump_anns(anns)
img = coco.loadImgs(anns[0]["image_id"])[0]
I = coco2pil(img["coco_url"])
plt.imshow(I)
coco.showAnns(anns)  # People in the stands
seg = anns[0]["segmentation"]
print("Counts", len(seg["counts"]))
print("Size", seg["size"])
plt.axis("off")
plt.show()

#0
segmentation :  {'counts': [272, 2, 4, 4, 4, 4, 2, 9, 1, 2, 16, 43, 143, 24, 5, 8, 16, 44, 141, 25, 8, 5, 17, 44, 140, 26, 10, 2, 17, 45, 129, 4, 5, 27, 24, 5, 1, 45, 127, 38, 23, 52, 125, 40, 22, 53, 123, 43, 20, 54, 122, 46, 18, 54, 121, 54, 12, 53, 119, 57, 11, 53, 117, 59, 13, 51, 117, 59, 13, 51, 117, 60, 11, 52, 117, 60, 10, 52, 118, 60, 9, 53, 118, 61, 8, 52, 119, 62, 7, 52, 119, 64, 1, 2, 2, 51, 120, 120, 120, 101, 139, 98, 142, 96, 144, 93, 147, 90, 150, 87, 153, 85, 155, 82, 158, 76, 164, 66, 174, 61, 179, 57, 183, 54, 186, 52, 188, 49, 191, 47, 193, 21, 8, 16, 195, 20, 13, 8, 199, 18, 222, 17, 223, 16, 224, 16, 224, 15, 225, 15, 225, 15, 225, 15, 225, 15, 225, 15, 225, 15, 225, 15, 225, 15, 225, 14, 226, 14, 226, 14, 39, 1, 186, 14, 39, 3, 184, 14, 39, 4, 183, 13, 40, 6, 181, 14, 39, 7, 180, 14, 39, 9, 178, 14, 39, 10, 177, 14, 39, 11, 176, 14, 38, 14, 174, 14, 36, 19, 171, 15, 33, 32, 160, 16, 30, 35, 159, 18, 26, 38, 158, 19, 23, 41, 157, 20, 19, 45, 156, 21, 15, 48, 156, 22, 10, 53, 155, 23, 9, 54, 154, 23, 8, 55, 154, 24, 7, 56, 153, 24, 6, 57, 153, 25, 5, 57, 153, 25, 5, 58, 152, 25, 4, 59, 152, 26, 3, 59, 152, 26, 3, 59, 152, 27, 1, 60, 152, 27, 1, 60, 152, 86, 154, 80, 160, 79, 161, 42, 8, 29, 161, 41, 11, 22, 2, 3, 161, 40, 13, 18, 5, 3, 161, 40, 15, 2, 5, 8, 7, 2, 161, 40, 24, 6, 170, 35, 30, 4, 171, 34, 206, 34, 41, 1, 164, 34, 39, 3, 164, 34, 37, 5, 164, 34, 35, 10, 161, 36, 1, 3, 28, 17, 155, 41, 27, 16, 156, 41, 26, 17, 156, 41, 26, 16, 157, 27, 4, 10, 25, 16, 158, 27, 6, 8, 11, 2, 12, 6, 2, 7, 159, 27, 7, 14, 3, 4, 19, 6, 160, 26, 8, 22, 18, 5, 161, 26, 8, 22, 18, 4, 162, 26, 8, 23, 15, 4, 164, 23, 11, 23, 11, 7, 165, 19, 17, 22, 9, 6, 167, 19, 22, 18, 8, 3, 170, 18, 25, 16, 7, 1, 173, 17, 28, 15, 180, 17, 30, 12, 181, 16, 34, 6, 184, 15, 225, 14, 226, 13, 227, 12, 228, 11, 229, 10, 230, 9, 231, 9, 231, 9, 231, 9, 231, 8, 232, 8, 232, 8, 232, 8, 232, 8, 232, 8, 232, 7, 233, 7, 233, 7, 233, 7, 233, 8, 232, 8, 232, 8, 232, 9, 231, 9, 231, 9, 231, 10, 230, 10, 230, 11, 229, 13, 227, 14, 226, 16, 224, 17, 223, 19, 221, 23, 217, 31, 3, 5, 201, 39, 201, 39, 201, 39, 201, 39, 201, 39, 201, 40, 200, 40, 200, 41, 199, 41, 199, 41, 199, 22, 8, 12, 198, 22, 12, 8, 198, 22, 14, 6, 198, 22, 15, 6, 197, 22, 16, 5, 197, 22, 17, 5, 196, 22, 18, 4, 196, 22, 19, 4, 195, 22, 19, 5, 194, 22, 20, 4, 194, 25, 21, 1, 193, 27, 213, 29, 211, 30, 210, 35, 6, 6, 193, 49, 191, 50, 190, 50, 190, 51, 189, 51, 189, 52, 188, 53, 187, 53, 187, 54, 186, 54, 186, 54, 186, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 55, 185, 28, 1, 26, 185, 23, 11, 21, 185, 20, 17, 17, 186, 18, 21, 15, 186, 16, 23, 14, 187, 14, 25, 14, 187, 14, 26, 12, 188, 14, 28, 10, 188, 14, 226, 14, 226, 16, 224, 17, 223, 19, 221, 20, 220, 22, 218, 24, 18, 3, 12, 3, 180, 25, 10, 1, 4, 6, 10, 6, 178, 28, 7, 12, 8, 8, 177, 49, 3, 12, 176, 65, 175, 67, 173, 69, 171, 53, 3, 14, 170, 37, 20, 9, 4, 1, 169, 36, 21, 8, 175, 35, 22, 7, 176, 34, 23, 7, 176, 34, 23, 6, 177, 35, 22, 6, 177, 35, 22, 8, 175, 35, 23, 9, 173, 35, 205, 36, 204, 39, 201, 43, 197, 48, 36, 1, 155, 48, 35, 3, 154, 49, 33, 5, 154, 48, 32, 6, 155, 49, 27, 10, 155, 51, 24, 11, 154, 54, 21, 11, 155, 56, 19, 11, 155, 56, 18, 11, 156, 56, 17, 11, 157, 56, 16, 12, 157, 56, 14, 13, 159, 56, 12, 13, 160, 61, 5, 14, 162, 78, 165, 75, 167, 73, 168, 72, 170, 70, 171, 69, 173, 67, 176, 64, 179, 61, 182, 58, 183, 57, 185, 54, 187, 53, 188, 51, 191, 49, 192, 47, 195, 45, 196, 43, 198, 42, 199, 40, 201, 38, 203, 36, 205, 34, 207, 32, 210, 28, 213, 26, 216, 22, 221, 16, 228, 8, 10250], 'size': [240, 320]}
area :  18419
iscrowd :  1
image_id :  448263
bbox :  [1, 0, 276, 122]
category_id :  1 person
id :  900100448263
Counts 869
Size [240, 320]


import numpy as np

annIds = coco.getAnnIds(imgIds=[448263])
anns = coco.loadAnns(annIds)
msk = np.zeros(seg["size"])

fig, ax = plt.subplots(nrows=4, ncols=4, figsize=(10, 10))


i = 0
for row in range(4):
    for col in range(4):
        ann = anns[i]
        msk = coco.annToMask(ann)
        ax[row, col].imshow(msk, cmap="gray")
        ax[row, col].set_title(num2cat[anns[i]["category_id"]])
        ax[row, col].axis("off")
        i += 1

plt.show()


from PIL import ImageDraw

annIds = coco.getAnnIds(imgIds=[448263])
anns = coco.loadAnns(annIds)
draw = ImageDraw.Draw(I)

colors = {1: "white", 40: "lime"}  # person - white, glove - lime
for ann in anns:
    x, y, width, heigth = ann["bbox"]  # bounding box here
    color = colors.get(ann["category_id"], None)
    if color:
        draw.rectangle((x, y, x + width, y + heigth), outline=color, width=2)
plt.imshow(I)
plt.show()


# !wget -qN "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
!wget -qN "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/annotations_trainval2017.zip"
!unzip -qn annotations_trainval2017.zip


import requests
import numpy as np
import matplotlib.pyplot as plt
from io import BytesIO
from PIL import Image
from pycocotools.coco import COCO
from IPython.display import clear_output


def coco2pil(url):
    response = requests.get(url)
    return Image.open(BytesIO(response.content))


coco = COCO("annotations/instances_val2017.json")
clear_output()

annIds = coco.getAnnIds(imgIds=[448263])
anns = coco.loadAnns(annIds)
img = coco.loadImgs(anns[0]["image_id"])[0]
I = coco2pil(img["coco_url"])

semantic_seg_person_mask = np.zeros(I.size[::-1], dtype=bool)  # WxH -> HxW

for ann in anns:
    msk = coco.annToMask(ann)  # HxW
    if ann["category_id"] == 1 and not ann["iscrowd"]:  # single person:
        # semantic_seg_person_mask = msk | semantic_seg_person_mask  # union
        semantic_seg_person_mask += msk.astype(bool)

semantic_seg_person_mask = semantic_seg_person_mask > 0  # binarize
plt.imshow(semantic_seg_person_mask, cmap="gray")
plt.show()


import torch

last_layer_output = torch.randn((3, 32, 32))  # class_num, W,H
print("Output of last layer shape", last_layer_output.shape)  # activation slice
mask = torch.argmax(last_layer_output, dim=0)  # class_nums prediction
print("One class mask shape", mask.shape)
print("Predictions for all classes \n", mask[:5, :5])

Output of last layer shape torch.Size([3, 32, 32])
One class mask shape torch.Size([32, 32])
Predictions for all classes 
 tensor([[2, 2, 0, 1, 2],
        [2, 1, 0, 0, 0],
        [0, 1, 1, 1, 1],
        [2, 0, 0, 0, 2],
        [0, 1, 1, 0, 2]])


import seaborn as sns


def img_to_heatmap(img, ax, title):  # Magik method to show img as heatmap
    ax.axis("off")
    ax.set_title(title)
    array = np.array(img)
    array = array[None, None, :]
    sns.heatmap(array[0][0], annot=True, ax=ax, lw=1, cbar=False)


# Fake image
raw = np.array([[1, 3, 0, 1], [3, 3, 3, 7], [8, 1, 8, 7], [6, 1, 1, 1]], dtype=np.uint8)
pil = Image.fromarray(raw)

interp_nn = pil.resize((8, 8), resample=Image.NEAREST)
interp_bl = pil.resize((8, 8), resample=Image.BILINEAR)

# plot result
fig, ax = plt.subplots(ncols=3, figsize=(15, 5), sharex=True, sharey=True)
img_to_heatmap(raw, ax[0], "Raster dataset")
img_to_heatmap(interp_nn, ax[1], "Nearest neighbor interpolation")
img_to_heatmap(interp_bl, ax[2], "Bilinear interpolation")
plt.show()


!wget -q https://edunet.kea.su/repo/EduNet-content/dev-2.0/L11/out/semantic_segmentation_1.png -O cat.png


from torch import nn
import torchvision.transforms.functional as TF


def upsample(pil, ax, mode="nearest"):
    tensor = TF.to_tensor(pil)
    # Create upsample instance

    if mode == "nearest":
        upsampler = nn.Upsample(scale_factor=2, mode=mode)
    else:
        upsampler = nn.Upsample(scale_factor=2, mode=mode, align_corners=True)

    tensor_128 = upsampler(tensor.unsqueeze(0))  # add batch dimension
    # Convert tensor to pillow
    img_128 = tensor_128.squeeze()
    img_128_pil = TF.to_pil_image(img_128.clamp(min=0, max=1))
    ax.imshow(img_128_pil)
    ax.set_title(mode)


# Load and show image in Pillow format
pic = Image.open("cat.png")
pil_64 = pic.resize((64, 64))
fig, ax = plt.subplots(ncols=4, figsize=(15, 5))
ax[0].imshow(pil_64)
ax[0].set_title("Raw")

# Upsample with Pytorch
upsample(pil_64, mode="nearest", ax=ax[1])
upsample(pil_64, mode="bilinear", ax=ax[2])
upsample(pil_64, mode="bicubic", ax=ax[3])
plt.show()


# fmt: off

model = nn.Sequential(
    nn.Upsample(scale_factor=2),
    nn.Conv2d(3, 16, kernel_size=3, padding=1),
    nn.ReLU()
)

# fmt: on

dummy_input = torch.randn((0, 3, 32, 32))
out = model(dummy_input)
print(out.shape)

torch.Size([0, 16, 64, 64])


torch.use_deterministic_algorithms(False, warn_only=False)


def coco2pil(url):
    print(url)
    response = requests.get(url)
    return Image.open(BytesIO(response.content))


def tensor_show(tensor, title="", ax=ax):
    img = TF.to_pil_image(tensor.squeeze()).convert("RGB")
    ax.set_title(title + str(img.size))
    ax.imshow(img)


pool = nn.MaxPool2d(
    kernel_size=2, return_indices=True
)  # False by default(get indexes to upsample)
unpool = nn.MaxUnpool2d(kernel_size=2)

pil = coco2pil("http://images.cocodataset.org/val2017/000000448263.jpg")

fig, ax = plt.subplots(ncols=5, figsize=(20, 5), sharex=True, sharey=True)

ax[0].set_title("original " + str(pil.size))
ax[0].imshow(pil)
tensor = TF.to_tensor(pil).unsqueeze(0)
print("Orginal shape", tensor.shape)

# Downsample
tensor_half_res, indexes1 = pool(tensor)
tensor_show(tensor_half_res, "1/2 down ", ax=ax[1])

tensor_q_res, indexes2 = pool(tensor_half_res)
tensor_show(tensor_q_res, "1/4 down ", ax=ax[2])
print("Downsample shape", indexes2.shape)

# Upsample
tensor_half_res1 = unpool(tensor_q_res, indexes2)
tensor_show(tensor_half_res1, "1/2 up ", ax=ax[3])


tensor_recovered = unpool(tensor_half_res1, indexes1)
tensor_show(tensor_recovered, "full size up ", ax=ax[4])
print("Upsample shape", tensor_recovered.shape)
plt.show()

http://images.cocodataset.org/val2017/000000448263.jpg
Orginal shape torch.Size([1, 3, 240, 320])
Downsample shape torch.Size([1, 3, 60, 80])
Upsample shape torch.Size([1, 3, 240, 320])


input = torch.randn(1, 16, 16, 16)  # define dummy input
print("Original size", input.shape)

downsample = nn.Conv2d(16, 16, 3, stride=2, padding=1)  # define downsample layer
upsample = nn.ConvTranspose2d(16, 16, 3, stride=2, padding=1)  # define upsample layer

# let`s downsample and upsample input
with torch.no_grad():
    output_1 = downsample(input)
    print("Downsampled size", output_1.size())

    output_2 = upsample(output_1, output_size=input.size())
    print("Upsampled size", output_2.size())

# plot results
fig, ax = plt.subplots(ncols=3, figsize=(15, 5), sharex=True, sharey=True)
sns.heatmap(input[0, 0, :, :], ax=ax[0], cbar=False, vmin=-2, vmax=2)
ax[0].set_title("Input")
sns.heatmap(output_1[0, 0, :, :], ax=ax[1], cbar=False, vmin=-2, vmax=2)
ax[1].set_title("Downsampled")
sns.heatmap(output_2[0, 0, :, :], ax=ax[2], cbar=False, vmin=-2, vmax=2)
ax[2].set_title("Upsampled")
plt.show()

Original size torch.Size([1, 16, 16, 16])
Downsampled size torch.Size([1, 16, 8, 8])
Upsampled size torch.Size([1, 16, 16, 16])


import torchvision
from torchvision import transforms


def coco2pil(url):
    print(url)
    response = requests.get(url)
    return Image.open(BytesIO(response.content))


# load resnet50
fcn_model = torchvision.models.segmentation.fcn_resnet50(
    weights="FCN_ResNet50_Weights.DEFAULT", num_classes=21
)

classes = [
    "__background__",
    "aeroplane",
    "bicycle",
    "bird",
    "boat",
    "bottle",
    "bus",
    "car",
    "cat",
    "chair",
    "cow",
    "diningtable",
    "dog",
    "horse",
    "motorbike",
    "person",
    "pottedplant",
    "sheep",
    "sofa",
    "train",
    "tvmonitor",
]

transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
        ),  # ImageNet
    ]
)

pil_img = coco2pil("http://images.cocodataset.org/val2017/000000448263.jpg")
input_tensor = transform(pil_img)

with torch.no_grad():
    output = fcn_model(input_tensor.unsqueeze(0))

Downloading: "https://download.pytorch.org/models/fcn_resnet50_coco-1167a1af.pth" to /root/.cache/torch/hub/checkpoints/fcn_resnet50_coco-1167a1af.pth
100%|██████████| 135M/135M [00:01<00:00, 101MB/s]

http://images.cocodataset.org/val2017/000000448263.jpg


print("output keys: ", output.keys())  # Ordered dictionary
print("out: ", output["out"].shape, "Batch, class_num, h, w")

output_predictions = output["out"][0].argmax(0)  # for first element of batch
print(f"output_predictions: {output_predictions.shape}")

fig = plt.figure(figsize=(10, 10))
plt.imshow(pil_img)
plt.axis("off")
plt.show()

indexes = output_predictions
semantic_seg_person_predict = np.zeros(pil_img.size).astype(bool)

# plot all classes predictions
fig, ax = plt.subplots(nrows=4, ncols=5, figsize=(10, 10))
i = 0  # counter
for row in range(4):
    for col in range(5):
        mask = torch.zeros(indexes.shape)
        mask[indexes == i] = 255

        ax[row, col].set_title(classes[i])
        ax[row, col].imshow(mask)
        ax[row, col].axis("off")
        i += 1

plt.show()

output keys:  odict_keys(['out', 'aux'])
out:  torch.Size([1, 21, 240, 320]) Batch, class_num, h, w
output_predictions: torch.Size([240, 320])


semantic_seg_person_predict = torch.zeros(indexes.shape)
semantic_seg_person_predict[indexes == 15] = 1  # to obtain binary mask
semantic_seg_person_predict = (
    semantic_seg_person_predict.numpy()
)  # for skliarn compability

plt.imshow(semantic_seg_person_predict, cmap="gray")
plt.show()


import matplotlib.pyplot as plt


plt.figure(figsize=(15, 10))

plt.subplot(1, 3, 1)
plt.axis("off")
plt.title("GT mask")
plt.imshow(semantic_seg_person_mask)

plt.subplot(1, 3, 2)
plt.axis("off")
plt.title("Predicted mask")
plt.imshow(semantic_seg_person_predict)

plt.subplot(1, 3, 3)
plt.title("GT & Predict overlap")
plt.axis("off")
tmp = semantic_seg_person_predict * 2 + semantic_seg_person_mask
plt.imshow(tmp)
plt.show()


from sklearn.metrics import jaccard_score

# wait vectors, so we flatten the data
y_true = semantic_seg_person_mask.flatten()
y_pred = semantic_seg_person_predict.flatten()
iou = jaccard_score(y_true, y_pred)

print(f"IoU = {iou:.2f}")

IoU = 0.49


from sklearn.metrics import confusion_matrix

# https://stackoverflow.com/questions/31324218/scikit-learn-how-to-obtain-true-positive-true-negative-false-positive-and-fal
# or use:  smp.metrics.get_stats
# https://smp.readthedocs.io/en/latest/metrics.html#segmentation_models_pytorch.metrics.functional.get_stats
tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
iou = tp / (tp + fp + fn)
print(f"IoU = {iou:.2f}")

IoU = 0.49


from sklearn.metrics import f1_score

dice_smp = f1_score(y_true, y_pred)
dice = 2 * tp / (2 * tp + fp + fn)
print(f"Dice = {dice:.2f}")
print(f"F1_score = {dice_smp:.2f}")

Dice = 0.66
F1_score = 0.66


import torch

mask_class_1 = torch.randint(0, 2, (1, 64, 64))  # [0 , 1]
one_class_out = torch.randn(1, 1, 64, 64)
print(mask_class_1.shape)
print(one_class_out.shape)

torch.Size([1, 64, 64])
torch.Size([1, 1, 64, 64])


from torch import nn

bce_loss_wl = nn.BCEWithLogitsLoss()  # Sigmoid inside
loss = bce_loss_wl(
    one_class_out, mask_class_1.float().unsqueeze(0)
)  # both params must have equal size
print(loss)

tensor(0.8207)


norm_one_class_out = one_class_out.sigmoid()

bce_loss = nn.BCELoss()
loss = bce_loss(
    norm_one_class_out, mask_class_1.float().unsqueeze(0)
)  # both params must have equal size
print(loss)

tensor(0.8207)


cross_entropy = nn.CrossEntropyLoss()
loss = cross_entropy(one_class_out, mask_class_1.float().unsqueeze(0))
print(loss)

tensor(-0.)


one_class_out.softmax(dim=1).unique()

tensor([1.])


mask_class1 = torch.randint(0, 2, (1, 64, 64))  # [0 , 1]
mask_class2 = torch.randint(0, 2, (1, 64, 64))

target = torch.cat((mask_class1, mask_class2))

print(target.shape)

torch.Size([2, 64, 64])


two_class_out = torch.randn(1, 2, 64, 64)
print(two_class_out.shape)

torch.Size([1, 2, 64, 64])


# https://pytorch.org/docs/stable/generated/torch.nn.BCEWithLogitsLoss.html

bce_loss = nn.BCEWithLogitsLoss()  # Sigmoid inside
float_target = target.float()  # add batch and convert ot float
loss = bce_loss(
    two_class_out, float_target.unsqueeze(0)
)  # both params must have equal size
print(loss)

tensor(0.8110)


cross_entropy = nn.CrossEntropyLoss()
# If containing class probabilities, same shape as the input and each value should be between [0,1][0,1].
loss = cross_entropy(two_class_out, float_target.unsqueeze(0))
print(loss)

tensor(0.9136)


sq_target = target.argmax(0)
sq_target.shape

torch.Size([64, 64])


# https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html
cross_entropy = nn.CrossEntropyLoss()

loss = cross_entropy(two_class_out, sq_target.unsqueeze(0))
print(loss)

tensor(0.9258)


!pip install -q segmentation-models-pytorch

clear_output()


import segmentation_models_pytorch as smp

iou_loss = smp.losses.JaccardLoss(smp.losses.MULTILABEL_MODE, from_logits=True)
print("IoU Loss", iou_loss(two_class_out, float_target.unsqueeze(0)))

IoU Loss tensor(0.6699)


dice_loss = smp.losses.DiceLoss(smp.losses.MULTILABEL_MODE, from_logits=True)

print(two_class_out.shape, target.shape)
print("DICE Loss", dice_loss(two_class_out, float_target.unsqueeze(0)))

torch.Size([1, 2, 64, 64]) torch.Size([2, 64, 64])
DICE Loss tensor(0.5037)


from IPython.display import clear_output

!pip install -q segmentation-models-pytorch
clear_output()


import torch
import segmentation_models_pytorch as smp
from segmentation_models_pytorch.encoders import get_preprocessing_fn

# 'mit_b0' Mix Vision Transformer Backbone from SegFormer pretrained on Imagenet
preprocess_input = get_preprocessing_fn("mit_b0", pretrained="imagenet")

# MixVisionTransformer encoder does not support in_channels setting other than 3
# supported by FPN only for encoder depth = 5
model = smp.FPN("mit_b0", in_channels=3, classes=10, encoder_depth=5)

# ... Train model on your dataset

dummy_input = torch.randn([1, 3, 64, 64])

mask = model(dummy_input)
clear_output()
print(mask.shape)  # torch.Size([1, 1, 64, 64])

torch.Size([1, 10, 64, 64])


import timm

model_names = timm.list_models(pretrained=True)
print("Total pretrained models: ", len(model_names))

Total pretrained models:  1163


model_names = timm.list_models("*mobilenet*small*")
print(model_names)

['mobilenetv3_small_050', 'mobilenetv3_small_075', 'mobilenetv3_small_100', 'tf_mobilenetv3_small_075', 'tf_mobilenetv3_small_100', 'tf_mobilenetv3_small_minimal_100']


from warnings import simplefilter

simplefilter("ignore", UserWarning)

timm_mobilenet = timm.create_model("mobilenetv3_small_050", pretrained=True)

model.safetensors:   0%|          | 0.00/6.42M [00:00<?, ?B/s]


out = timm_mobilenet(dummy_input)
print(out.shape)

torch.Size([1, 1000])


smp_timm_model = smp.DeepLabV3("tu-mobilenetv3_small_050", in_channels=3, classes=80)
smp_timm_model.eval()
print("Created DeepLab with mobileNet encoder")

Created DeepLab with mobileNet encoder


mask = smp_timm_model(dummy_input)
print(mask.shape)

torch.Size([1, 80, 64, 64])


import torch
import random
import numpy as np


def fix_seeds():
    torch.manual_seed(42)
    random.seed(42)
    np.random.seed(42)


fix_seeds()


from torch import nn
from torchvision.models import resnet18

# load pretrained model
resnet_detector = resnet18(weights="ResNet18_Weights.DEFAULT")

# Change "head" to predict coordinates (x1,y1 x2,y2)
resnet_detector.fc = nn.Linear(resnet_detector.fc.in_features, 4)  # x1,y1 x2,y2

Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to /root/.cache/torch/hub/checkpoints/resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 138MB/s]


criterion = nn.MSELoss()

# This is a random example. Don't expect good results
input = torch.rand((1, 3, 224, 224))
target = torch.tensor([[0.1, 0.1, 0.5, 0.5]])  # x1,y1 x2,y2 or x,y w,h
print(f"Target: {target}")
output = resnet_detector(input)
loss = criterion(output, target)
print(f"Output: {output}")
print(f"Loss: {loss}")

Target: tensor([[0.1000, 0.1000, 0.5000, 0.5000]])
Output: tensor([[ 0.0432,  0.5333, -0.1924,  0.8475]], grad_fn=<AddmmBackward0>)
Loss: 0.19780868291854858


resnet_detector.fc = nn.Sequential(
    nn.Linear(resnet_detector.fc.in_features, 4), nn.Sigmoid()
)

resnet_detector(input)

tensor([[0.2782, 0.4618, 0.3732, 0.3802]], grad_fn=<SigmoidBackward0>)

Swin Transformer (2021)¶


from IPython.display import clear_output

!pip install -q super_gradients
clear_output()


# !wget -q https://github.com/ultralytics/assets/releases/download/v0.0.0/yolo_nas_s.pt
!wget -q https://edunet.kea.su/EduNet-web_dependencies/weights/yolo_nas_s.pt


from super_gradients.training import models
from super_gradients.common.object_names import Models

clear_output()

yolo = models.get(Models.YOLO_NAS_M, pretrained_weights="coco")

[2024-02-16 12:59:26] INFO - checkpoint_utils.py - License Notification: YOLO-NAS pre-trained weights are subjected to the specific license terms and conditions detailed in 
https://github.com/Deci-AI/super-gradients/blob/master/LICENSE.YOLONAS.md
By downloading the pre-trained weight files you agree to comply with these terms.
Downloading: "https://sghub.deci.ai/models/yolo_nas_m_coco.pth" to /root/.cache/torch/hub/checkpoints/yolo_nas_m_coco.pth
100%|██████████| 196M/196M [00:05<00:00, 37.5MB/s]
[2024-02-16 12:59:33] INFO - checkpoint_utils.py - Successfully loaded pretrained weights for architecture yolo_nas_m


"""
  Important note - When using batched input (4-dimensional np.ndarray or
  torch.Tensor) formats, normalization and size preprocessing will be applied
  to these inputs. This means that the input tensors should not be normalized
  beforehand. Here is the example of incorrect code of using model.predict()
"""

results = yolo.predict(
    "http://images.cocodataset.org/val2017/000000448263.jpg",  # baseball
    conf=0.25,  # for NMS
    iou=0.7,  # for NMS
)

[2024-02-16 12:59:36] INFO - pipelines.py - Fusing some of the model's layers. If this takes too much memory, you can deactivate it by setting `fuse_model=False`


# The model.predict() method returns an ImagesDetectionPrediction object, which contains the detection results for each image.

print(results.prediction)  # contains detections for one image

DetectionPrediction(bboxes_xyxy=array([[ 18.093056 ,  11.2314825,  81.05606  , 227.98335  ],
       [ 16.734737 , 122.68073  ,  54.112534 , 171.00551  ],
       [ 79.94884  ,   7.152906 , 231.75941  , 229.11546  ],
       [ 81.45465  ,   7.311383 , 302.19525  , 229.49275  ],
       [150.41779  , 176.44623  , 304.30032  , 230.20758  ],
       [197.51967  , 179.5092   , 302.5491   , 230.1062   ],
       [ 54.98115  , 127.76092  ,  97.13468  , 205.39932  ],
       [150.78186  , 177.38443  , 177.65022  , 210.23045  ]],
      dtype=float32), confidence=array([0.9516091 , 0.92642736, 0.7283859 , 0.5923151 , 0.29434118,
       0.2638578 , 0.26014116, 0.25849485], dtype=float32), labels=array([ 0, 35,  0,  0,  0,  0,  2, 35]))


image_prediction = results

class_names = image_prediction.class_names
labels = image_prediction.prediction.labels
confidence = image_prediction.prediction.confidence
bboxes = image_prediction.prediction.bboxes_xyxy

print("Classes", class_names)
print("Labels shape", labels.shape)
print("Confidence  shape", confidence.shape)
print("BBoxes  shape", bboxes.shape)  # xyxy

Classes ('person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush')
Labels shape (8,)
Confidence  shape (8,)
BBoxes  shape (8, 4)


print(bboxes)

[[ 18.093056   11.2314825  81.05606   227.98335  ]
 [ 16.734737  122.68073    54.112534  171.00551  ]
 [ 79.94884     7.152906  231.75941   229.11546  ]
 [ 81.45465     7.311383  302.19525   229.49275  ]
 [150.41779   176.44623   304.30032   230.20758  ]
 [197.51967   179.5092    302.5491    230.1062   ]
 [ 54.98115   127.76092    97.13468   205.39932  ]
 [150.78186   177.38443   177.65022   210.23045  ]]


image_prediction.show(box_thickness=2, show_confidence=True)


!wget -q "http://images.cocodataset.org/val2017/000000448263.jpg"


from PIL import Image
from torchvision.transforms import ToTensor

I = Image.open("000000448263.jpg")
t = ToTensor()(I)


from IPython.display import clear_output
from torchvision.models.detection import (
    maskrcnn_resnet50_fpn,
    MaskRCNN_ResNet50_FPN_Weights,
)


mask_rcnn = maskrcnn_resnet50_fpn(weights=MaskRCNN_ResNet50_FPN_Weights.DEFAULT)
clear_output()
mask_rcnn.eval()
predictions = mask_rcnn(t.unsqueeze(0))


print(predictions[0].keys())

dict_keys(['boxes', 'labels', 'scores', 'masks'])


predictions[0]["masks"].shape

torch.Size([61, 1, 240, 320])


import matplotlib.pyplot as plt


def show_masks(masks, classes=None):
    fig, ax = plt.subplots(nrows=4, ncols=8, figsize=(20, 10))
    i = 0
    for row in range(4):
        for col in range(8):
            if classes is not None:
                ax[row, col].set_title(int(classes[i]))
            ax[row, col].imshow(masks[i])
            ax[row, col].axis("off")
            i += 1
            if i >= len(masks):
                plt.show()
                return

    plt.show()


show_masks(
    predictions[0]["masks"].detach().squeeze(1), predictions[0]["labels"].detach()
)


mask = predictions[0]["masks"][2].detach().squeeze(0)
mask = mask > 0.5
plt.imshow(mask, cmap="gray")
plt.show()


!pip install -q ultralytics

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 715.0/715.0 kB 7.5 MB/s eta 0:00:00


from ultralytics import YOLO

yolo_seg = YOLO("yolov8m-seg.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.1.0/yolov8m-seg.pt to 'yolov8m-seg.pt'...

100%|██████████| 52.4M/52.4M [00:00<00:00, 168MB/s]


seg_results = yolo_seg(I)

0: 480x640 4 persons, 1 baseball glove, 247.2ms
Speed: 22.0ms preprocess, 247.2ms inference, 1380.4ms postprocess per image at shape (1, 3, 480, 640)


seg_results[0].masks.shape

torch.Size([5, 480, 640])


seg_results[0].masks.data.shape

torch.Size([5, 480, 640])


seg_results[0].boxes.cls

tensor([35.,  0.,  0.,  0.,  0.], device='cuda:0')


plt.figure(figsize=(10, 6))
pil_with_bbox = seg_results[0].plot(boxes=True, masks=True)
plt.imshow(pil_with_bbox[..., ::-1])  # BGR?
plt.show()


!pip install -q transformers


import torch
from transformers import Owlv2Processor, Owlv2ForObjectDetection
from IPython.display import clear_output

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")

model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

model.eval()
model.to(device)
print(" ")
clear_output()
print("loaded")  # suppress huge output of model structure


!wget -qN "http://images.cocodataset.org/val2017/000000448263.jpg"


from PIL import Image


img = Image.open("000000448263.jpg")
img


texts = ["cap", "botle", "text", "boy", "player"]  #


batch = [img]
batch_size = len(batch)
with torch.inference_mode():
    inputs = processor(
        text=texts * batch_size,  # copy the same text for all images
        images=[img],  # batch of images in PIL format
        return_tensors="pt",
    )
    inputs = inputs.to(device)
    outputs = model(**inputs)  # return object of type OwlViTObjectDetectionOutput


print("bboxes", outputs["pred_boxes"].shape)  # batch * total_boxes * 4 coords per box
print("logits", outputs["logits"].shape)  # batch * total_boxes * text_count

bboxes torch.Size([1, 3600, 4])
logits torch.Size([1, 3600, 5])


size = img.size[::-1]  # WH -> HW
target_sizes = torch.tensor([size])
print(target_sizes)

tensor([[240, 320]])


results = processor.post_process_object_detection(
    threshold=0.1,  # drop bbox with scores below 0.1
    outputs=outputs,
    target_sizes=target_sizes.to(device),
)

print(results)

[{'scores': tensor([0.3007, 0.2715, 0.5686, 0.4415], device='cuda:0'), 'labels': tensor([0, 0, 3, 4], device='cuda:0'), 'boxes': tensor([[ 96.9493,   5.3616, 147.0229,  24.2409],
        [ 35.6500,   8.5195,  81.3124,  28.4498],
        [ 20.2118,   8.5109,  80.1002, 172.0828],
        [ 78.7046,   5.6050, 301.0884, 173.4483]], device='cuda:0')}]


import numpy as np
import matplotlib.pyplot as plt
from torchvision.ops import box_convert
from torchvision.utils import draw_bounding_boxes


def draw_bbox(img, bb, color, labels=None, xywh=True):
    t_img = torch.tensor(img).permute(2, 0, 1)  # to tensor CHW
    bb = bb[:, :4].clone().detach()  # take only coords
    if xywh:
        bb = box_convert(bb, "xywh", "xyxy")  # convert from COCO format
    img_with_bb = draw_bounding_boxes(t_img, bb, colors=color, width=2, labels=labels)
    return img_with_bb.permute(1, 2, 0).numpy()  # back to numpy HWC


def show_owl_results(results):
    labels = []
    for i, text_idx in enumerate(results[0]["labels"]):
        labels.append(texts[text_idx] + f" {results[0]['scores'][i]:.2f}")
    image_with_bb = draw_bbox(
        np.array(img), results[0]["boxes"], "lime", labels=labels, xywh=False
    )
    plt.figure(figsize=(8, 6))
    plt.imshow(image_with_bb)
    plt.show()


show_owl_results(results)


coarse_results = processor.post_process_object_detection(
    threshold=0.07,  # change threshold
    outputs=outputs,
    target_sizes=target_sizes.to(device),
)

show_owl_results(coarse_results)


!pip install -q git+https://github.com/facebookresearch/segment-anything.git

  Preparing metadata (setup.py) ... done
  Building wheel for segment-anything (setup.py) ... done


# ViT-H
# !wget -qN https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
!wget -qN https://edunet.kea.su/repo/EduNet-web_dependencies/weights/sam_vit_h_4b8939.pth


import torch
from segment_anything import sam_model_registry

# model_type = 'vit_h'
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sam.to(device=device)

Sam(
  (image_encoder): ImageEncoderViT(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1280, kernel_size=(16, 16), stride=(16, 16))
    )
    (blocks): ModuleList(
      (0-31): 32 x Block(
        (norm1): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1280, out_features=3840, bias=True)
          (proj): Linear(in_features=1280, out_features=1280, bias=True)
        )
        (norm2): LayerNorm((1280,), eps=1e-06, elementwise_affine=True)
        (mlp): MLPBlock(
          (lin1): Linear(in_features=1280, out_features=5120, bias=True)
          (lin2): Linear(in_features=5120, out_features=1280, bias=True)
          (act): GELU(approximate='none')
        )
      )
    )
    (neck): Sequential(
      (0): Conv2d(1280, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (1): LayerNorm2d()
      (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (3): LayerNorm2d()
    )
  )
  (prompt_encoder): PromptEncoder(
    (pe_layer): PositionEmbeddingRandom()
    (point_embeddings): ModuleList(
      (0-3): 4 x Embedding(1, 256)
    )
    (not_a_point_embed): Embedding(1, 256)
    (mask_downscaling): Sequential(
      (0): Conv2d(1, 4, kernel_size=(2, 2), stride=(2, 2))
      (1): LayerNorm2d()
      (2): GELU(approximate='none')
      (3): Conv2d(4, 16, kernel_size=(2, 2), stride=(2, 2))
      (4): LayerNorm2d()
      (5): GELU(approximate='none')
      (6): Conv2d(16, 256, kernel_size=(1, 1), stride=(1, 1))
    )
    (no_mask_embed): Embedding(1, 256)
  )
  (mask_decoder): MaskDecoder(
    (transformer): TwoWayTransformer(
      (layers): ModuleList(
        (0-1): 2 x TwoWayAttentionBlock(
          (self_attn): Attention(
            (q_proj): Linear(in_features=256, out_features=256, bias=True)
            (k_proj): Linear(in_features=256, out_features=256, bias=True)
            (v_proj): Linear(in_features=256, out_features=256, bias=True)
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (cross_attn_token_to_image): Attention(
            (q_proj): Linear(in_features=256, out_features=128, bias=True)
            (k_proj): Linear(in_features=256, out_features=128, bias=True)
            (v_proj): Linear(in_features=256, out_features=128, bias=True)
            (out_proj): Linear(in_features=128, out_features=256, bias=True)
          )
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (mlp): MLPBlock(
            (lin1): Linear(in_features=256, out_features=2048, bias=True)
            (lin2): Linear(in_features=2048, out_features=256, bias=True)
            (act): ReLU()
          )
          (norm3): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm4): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (cross_attn_image_to_token): Attention(
            (q_proj): Linear(in_features=256, out_features=128, bias=True)
            (k_proj): Linear(in_features=256, out_features=128, bias=True)
            (v_proj): Linear(in_features=256, out_features=128, bias=True)
            (out_proj): Linear(in_features=128, out_features=256, bias=True)
          )
        )
      )
      (final_attn_token_to_image): Attention(
        (q_proj): Linear(in_features=256, out_features=128, bias=True)
        (k_proj): Linear(in_features=256, out_features=128, bias=True)
        (v_proj): Linear(in_features=256, out_features=128, bias=True)
        (out_proj): Linear(in_features=128, out_features=256, bias=True)
      )
      (norm_final_attn): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (iou_token): Embedding(1, 256)
    (mask_tokens): Embedding(4, 256)
    (output_upscaling): Sequential(
      (0): ConvTranspose2d(256, 64, kernel_size=(2, 2), stride=(2, 2))
      (1): LayerNorm2d()
      (2): GELU(approximate='none')
      (3): ConvTranspose2d(64, 32, kernel_size=(2, 2), stride=(2, 2))
      (4): GELU(approximate='none')
    )
    (output_hypernetworks_mlps): ModuleList(
      (0-3): 4 x MLP(
        (layers): ModuleList(
          (0-1): 2 x Linear(in_features=256, out_features=256, bias=True)
          (2): Linear(in_features=256, out_features=32, bias=True)
        )
      )
    )
    (iou_prediction_head): MLP(
      (layers): ModuleList(
        (0-1): 2 x Linear(in_features=256, out_features=256, bias=True)
        (2): Linear(in_features=256, out_features=4, bias=True)
      )
    )
  )
)


!wget -q "http://images.cocodataset.org/val2017/000000448263.jpg"


import numpy as np
from PIL import Image

img = Image.open("000000448263.jpg")
np_im = np.array(img)  # HWC format
img


%%time
from segment_anything import SamAutomaticMaskGenerator

mask_generator = SamAutomaticMaskGenerator(sam)
masks = mask_generator.generate(np_im)

CPU times: user 6.06 s, sys: 24.3 ms, total: 6.08 s
Wall time: 6.31 s


masks[0]

{'segmentation': array([[False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        ...,
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False],
        [False, False, False, ..., False, False, False]]),
 'area': 5943,
 'bbox': [104, 40, 108, 82],
 'predicted_iou': 1.0488085746765137,
 'point_coords': [[175.0, 86.25]],
 'stability_score': 0.9864819645881653,
 'crop_box': [0, 0, 320, 240]}


masks[0]["segmentation"].shape

(240, 320)


# https://github.com/facebookresearch/segment-anything/blob/main/notebooks/automatic_mask_generator_example.ipynb
import matplotlib.pyplot as plt


def show_anns(anns):
    if len(anns) == 0:
        return
    sorted_anns = sorted(anns, key=(lambda x: x["area"]), reverse=True)
    ax = plt.gca()
    ax.set_autoscale_on(False)

    img = np.ones(
        (
            sorted_anns[0]["segmentation"].shape[0],
            sorted_anns[0]["segmentation"].shape[1],
            4,
        )
    )
    img[:, :, 3] = 0
    for ann in sorted_anns:
        m = ann["segmentation"]
        color_mask = np.concatenate([np.random.random(3), [0.35]])
        img[m] = color_mask
    ax.imshow(img)


plt.figure(figsize=(10, 8))
plt.imshow(img)
show_anns(masks)
plt.axis("off")
plt.show()


%%time

from segment_anything import SamPredictor


predictor = SamPredictor(sam)
predictor.set_image(np_im)  # create embedding

CPU times: user 1.68 s, sys: 3.06 ms, total: 1.68 s
Wall time: 1.82 s


masks, scores, logits = predictor.predict(
    point_coords=np.array([[200, 200], [1, 1]]),  # point coords
    point_labels=np.array([1, 0]),  # 1 - object(foreground), 0 - background
    # box
    # mask_input
    multimask_output=True,  # return 1 or 3 masks because of the ambiguous input
)


print("Masks count", len(masks))
print("Scores", scores)

Masks count 3
Scores [    0.85098     0.95786     0.98574]


print(masks[0].shape)

(240, 320)


def show_mask(mask, ax, random_color=False):
    if random_color:
        color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
    else:
        color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    h, w = mask.shape[-2:]
    mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
    ax.imshow(mask_image)


plt.imshow(img)
show_mask(masks[2], plt.gca())
plt.axis("off")
plt.show()


def precision(TP, FP):
    return TP / (TP + FP)


def recall(TP, FN):
    return TP / (TP + FN)


pres = precision(TP=7, FP=4)
rec = recall(TP=7, FN=3)

print("Precision = %.2f" % pres)
print("Recall = %.2f" % rec)

Precision = 0.64
Recall = 0.70


import numpy as np

y_true = np.array([1, 1, 0])  # Labels
y_pred = np.array([0.8, 0.1, 0.2])  # Predictions


import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve


def pr_curve(y_true, y_pred):
    plt.figure(figsize=(5, 5))
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred)
    plt.plot(recall, precision, marker="o")
    for i, t in enumerate(thresholds):
        plt.annotate(str(t), (recall[i], precision[i]))
    plt.ylim([0, 1.1])
    plt.xlabel("recall")
    plt.ylabel("precision")
    plt.title("PR curve")
    plt.show()


pr_curve(y_true, y_pred)


import pandas as pd

detections = np.array(
    [
        [290, 50, 170, 160, 0.7, 53],  # x, y, w, h, confidence, class_num
        [10, 200, 190, 180, 0.8, 53],
        [310, 250, 120, 130, 0.75, 53],
    ]
)
clear_output()
pd.DataFrame(
    data=detections, columns=["x", "y", "width", "height", "confidence", "class_num"]
)


!wget -qN "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
# !wget -qN "https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/annotations_trainval2017.zip"
!unzip -qn annotations_trainval2017.zip


from pycocotools.coco import COCO
from IPython.display import clear_output


coco = COCO("annotations/instances_val2017.json")
clear_output()

apples_img_id = 60855  # if of some image with apples
apple_cat_id = 53  # apple

ann_id = coco.getAnnIds(imgIds=[apples_img_id])
anns = coco.loadAnns(ann_id)

gt_bbox = []
for ann in anns:
    if ann["category_id"] == apple_cat_id:
        gt_bbox.append(ann["bbox"] + [ann["category_id"]])


pd.DataFrame(data=gt_bbox, columns=["x", "y", "width", "height", "class_num"])

Done (t=0.77s)
creating index...
index created!


import torch
import skimage
from torchvision.ops import box_convert
from torchvision.utils import draw_bounding_boxes


def draw_bbox(img, bb_xywh, color):
    t_img = torch.tensor(img).permute(2, 0, 1)  # to tensor CHW
    xywh = torch.tensor(bb_xywh)[:, :4]  # take only coords
    bb = box_convert(xywh, "xywh", "xyxy")  # convert from COCO format
    img_with_bb = draw_bounding_boxes(t_img, bb, colors=color, width=4)
    return img_with_bb.permute(1, 2, 0).numpy()  # back to numpy HWC


img_info = coco.loadImgs(apples_img_id)

img = skimage.io.imread(img_info[0]["coco_url"])
img = draw_bbox(img, detections, "blue")
img = draw_bbox(img, gt_bbox, "lime")
plt.figure(figsize=(8, 6))
plt.imshow(img)
plt.show()


gt = box_convert(torch.tensor(gt_bbox)[:, :4], "xywh", "xyxy")
pred = box_convert(torch.tensor(detections)[:, :4], "xywh", "xyxy")


from torchvision.ops import box_iou

pairwise_iou = box_iou(gt, pred)
print(pairwise_iou)


iou, _ = pairwise_iou.max(dim=0)
print(iou)

tensor([0.6153, 0.8595, 0.0000], dtype=torch.float64)


predictions = torch.tensor(detections)[:, 4]
predictions = torch.vstack((predictions, iou)).T

pd.DataFrame(data=predictions, columns=["confidence", "iou"])


gt = iou > 0.5

predictions = torch.vstack((predictions[:, 0], gt)).T
detection_results = pd.DataFrame(data=predictions, columns=["confidence", "gt"])
detection_results.head()


plt.figure(figsize=(7, 5))
precision, recall, thresholds = precision_recall_curve(gt, predictions[:, 0])
plt.plot(recall, precision, marker="o")
plt.ylim([0, 1.1])
plt.xlabel("recall")
plt.ylabel("precision")
plt.title("PR curve")
plt.show()


from sklearn.metrics import average_precision_score

ap = average_precision_score(gt, predictions[:, 0])
print("AP", ap)

AP 0.8333333333333333


from sklearn import metrics

auc = metrics.auc(recall, precision)
print("AUC", auc)

AUC 0.7916666666666666


def smooth_precision(precision):
    smooth_prec = []
    for i in range(1, len(precision) + 1):
        max = precision[:i].max()
        smooth_prec.append(max)
    return smooth_prec


detection_results = detection_results.sort_values("confidence", ascending=False)
precision, recall, thresholds = precision_recall_curve(
    detection_results["gt"], detection_results["confidence"]
)

smoothed_precision = smooth_precision(precision)
plt.figure(figsize=(7, 5))
plt.plot(recall, precision, marker="o")
plt.plot(recall, smoothed_precision, marker="o")
plt.ylim([0, 1.1])
plt.xlabel("recall")
plt.ylabel("precision")
plt.title("PR curve")
plt.show()


smooth_auc = metrics.auc(recall, smoothed_precision)
print("smooth AUC ", smooth_auc)

smooth AUC  0.8333333333333333


!pip install -q torchmetrics


from torchmetrics.detection.mean_ap import MeanAveragePrecision

t_gt_bbox = torch.tensor(gt_bbox)
t_detections = torch.tensor(detections)

map_obj = MeanAveragePrecision("xywh", iou_thresholds=[0.5, 0.55, 0.6, 0.65, 0.7, 0.75])
results = map_obj(
    preds=[
        {
            "boxes": t_detections[:, :4],  # xywh
            "scores": t_detections[:, 4],  # confidence
            "labels": t_detections[:, 5].int(),  # class num
        }
    ],
    target=[
        {
            "boxes": t_gt_bbox[:, :4],
            "labels": t_gt_bbox[:, 4].int(),
        }  # xywh  # class num
    ],
)

print("mAP@[0.5:0.75] = ", results["map"].item())

mAP@[0.5:0.75] =  0.669966995716095

	x	y	width	height	confidence	class_num
0	290.0	50.0	170.0	160.0	0.70	53.0
1	10.0	200.0	190.0	180.0	0.80	53.0
2	310.0	250.0	120.0	130.0	0.75	53.0

	x	y	width	height	class_num
0	269.44	61.96	154.91	161.48	53
1	24.26	208.04	177.04	176.13	53

Задачи компьютерного зрения¶

Dataset COCO — Common Objects in Context¶

Категории в COCO¶

Разметка данных¶

Семантическая сегментация (Semantic segmentation)¶

Способы предсказания класса для каждого пикселя¶

Fully Convolutional Networks¶

Разжимающий слой¶

Интерполяция при увеличении разрешения¶

Upsample в PyTorch¶

Другие способы "разжать" карту признаков¶

MaxUnpooling¶

Transposed convolution¶

Пирамида признаков¶

Метрики¶

IoU¶

Dice¶

Loss функции для сегментации¶

Distribution-based loss¶

Binary Cross-Entropy (BCE)¶

Cross-Entropy¶

Region-based loss¶

Jaccard Loss¶

Dice loss¶

U-Net: Convolutional Networks for Biomedical Image Segmentation¶

Обзор DeepLabv3+ (2018)¶

Atrous (Dilated) Convolution¶

Segmentation models PyTorch (SMP)¶

Совместимость с timm¶

Совместимость с SMP¶

Детектирование (Object detection)¶

Детектирование единственного объекта¶

Предсказание координат¶

Multitask loss¶

Подбор весов для каждой компоненты loss¶

Детектирование нескольких объектов¶

Object proposal¶

NMS¶

Backbone для детекторов¶

Path Aggregation Network Module(PAN)¶

Swin Transformer (2021)¶

YOLO¶

Instance Segmentation¶

Panoptic Segmentation¶

OWL-ViT2 (Jul 2023)¶

SAM (2023)¶

Оценка качества детекции¶

mAP — mean Average Precision¶

Precision & recall¶

PRCurve¶

IoU вместо метки класса¶

Average Precision¶

COCO mAP¶

AP для разных порогов.¶

Практические соображения¶