import torch
import random
import numpy as np

# Seed NumPy, Python's `random` and PyTorch with the same value so that
# repeated runs of the notebook draw the same random numbers
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Full list of labels
#'https://s3.amazonaws.com/deep-learning-models/image-models/imagenet_class_index.json'
!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/imagenet_class_index.json

# https://github.com/ajschumacher/imagen.git
!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/datasets/imagen.zip
!unzip -q imagen.zip


import json
import pprint
import numpy as np

# Parse the JSON mapping "class index" -> [WordNet id, class name]
with open("imagenet_class_index.json") as f:
    imagenet_labels = json.load(f)

# Element 1 of every [WordNet id, class name] pair is the class name
classes = np.array([pair for pair in imagenet_labels.values()])[:, 1]

# Pretty-print only the first ten entries so the long dict stays readable
pp = pprint.PrettyPrinter(width=41, compact=True)
pp.pprint({key: imagenet_labels[key] for key in list(imagenet_labels)[:10]})

{'0': ['n01440764', 'tench'],
 '1': ['n01443537', 'goldfish'],
 '2': ['n01484850', 'great_white_shark'],
 '3': ['n01491361', 'tiger_shark'],
 '4': ['n01494475', 'hammerhead'],
 '5': ['n01496331', 'electric_ray'],
 '6': ['n01498041', 'stingray'],
 '7': ['n01514668', 'cock'],
 '8': ['n01514859', 'hen'],
 '9': ['n01518878', 'ostrich']}


from glob import glob
from PIL import Image
from torch.utils.data import Dataset


class MicroImageNet(Dataset):
    """Small image dataset whose labels are resolved through WordNet ids.

    Image files are expected to be named ``<wordnet id>_<...>.jpg``. Only
    files whose WordNet id occurs in the class-index JSON are kept, because
    not every id present in the image folder exists in the label file.

    Args:
        labels_path: JSON file mapping ``"class index"`` to
            ``[WordNet id, class name]``.
        images_glob: glob pattern that selects the image files.

    Attributes set by the constructor:
        labels: one ``[class index, class name, WordNet id]`` list per image.
        paths: image file paths, sorted, aligned with ``labels``.
    """

    def __init__(
        self,
        labels_path="imagenet_class_index.json",
        images_glob="imagen/*.jpg",
    ):
        super().__init__()
        # Local import keeps the class self-contained wherever it is pasted
        import os

        # Kept for backward compatibility; nothing in this class fills it
        self.num2id = {}

        # Load labels and index them by WordNet id
        with open(labels_path) as f:
            imagenet_labels = json.load(f)
        w_net = {
            entry[0]: {"num": int(key), "name": entry[1]}
            for key, entry in imagenet_labels.items()
        }

        self.labels = []
        self.paths = []

        # Load data
        for path in sorted(glob(images_glob)):
            # The WordNet id is the first "_"-separated token of the file
            # name. Using the base name (instead of slicing off a fixed
            # 7-character "imagen/" prefix) works for any directory.
            wn_id = os.path.basename(path).split("_")[0]
            entry = w_net.get(wn_id)
            if entry is None:
                continue  # id unknown to the label file: skip this image
            self.labels.append([entry["num"], entry["name"], wn_id])
            self.paths.append(path)

    def __getitem__(self, idx):
        """Return ``(PIL image, class index)`` for the sample at ``idx``."""
        # Image.open is lazy and keeps the file open; load the pixel data
        # inside the context manager so the file handle is released at once
        with Image.open(self.paths[idx]) as im:
            im.load()
        return im, self.labels[idx][0]

    def __len__(self):
        """Return the number of images that have a known label."""
        return len(self.paths)


# Build the dataset from the label file and image folder in the working directory
microImgNet = MicroImageNet()


import matplotlib.pyplot as plt

# Default figure size (width, height) in inches for every plot below
plt.rcParams["figure.figsize"] = (15, 10)


def show(img, label_1, num, label_2="", n_rows=2, n_cols=3):
    """Draw one image into a panel of a subplot grid.

    Args:
        img: image accepted by ``imshow``.
        label_1: text shown as the panel title.
        num: zero-based panel index inside the grid.
        label_2: optional text shown below the image as the x-label.
        n_rows: number of grid rows (default 2).
        n_cols: number of grid columns (default 3).
    """
    ax = plt.subplot(n_rows, n_cols, num + 1)
    ax.imshow(img)
    ax.set_title(label_1)
    ax.set_xlabel(label_2)
    # plt.axis("off") would hide the x-label together with the axes, so
    # label_2 was never visible. Hide only the ticks and the frame instead.
    ax.set_xticks([])
    ax.set_yticks([])
    for spine in ax.spines.values():
        spine.set_visible(False)


# Preview every sixth sample: dataset items 0, 6, ..., 30 fill the six panels
for i, sample_idx in enumerate(range(0, 36, 6)):
    img, label = microImgNet[sample_idx]
    name = microImgNet.labels[sample_idx][1]  # human-readable class name
    show(img, name, i)


from torchvision import models

# Build AlexNet with its default weights; the checkpoint is downloaded on
# first use (see the download log below)
alexnet = models.alexnet(weights="AlexNet_Weights.DEFAULT")

Downloading: "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth" to /root/.cache/torch/hub/checkpoints/alexnet-owt-7be5be79.pth
100%|██████████| 233M/233M [00:02<00:00, 97.6MB/s]


from torchsummary import summary

print("AlexNet architecture")
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line after the table
summary(alexnet, (3, 224, 224), device="cpu")
print(alexnet)

AlexNet architecture
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1           [-1, 64, 55, 55]          23,296
              ReLU-2           [-1, 64, 55, 55]               0
         MaxPool2d-3           [-1, 64, 27, 27]               0
            Conv2d-4          [-1, 192, 27, 27]         307,392
              ReLU-5          [-1, 192, 27, 27]               0
         MaxPool2d-6          [-1, 192, 13, 13]               0
            Conv2d-7          [-1, 384, 13, 13]         663,936
              ReLU-8          [-1, 384, 13, 13]               0
            Conv2d-9          [-1, 256, 13, 13]         884,992
             ReLU-10          [-1, 256, 13, 13]               0
           Conv2d-11          [-1, 256, 13, 13]         590,080
             ReLU-12          [-1, 256, 13, 13]               0
        MaxPool2d-13            [-1, 256, 6, 6]               0
AdaptiveAvgPool2d-14            [-1, 256, 6, 6]               0
          Dropout-15                 [-1, 9216]               0
           Linear-16                 [-1, 4096]      37,752,832
             ReLU-17                 [-1, 4096]               0
          Dropout-18                 [-1, 4096]               0
           Linear-19                 [-1, 4096]      16,781,312
             ReLU-20                 [-1, 4096]               0
           Linear-21                 [-1, 1000]       4,097,000
================================================================
Total params: 61,100,840
Trainable params: 61,100,840
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 8.38
Params size (MB): 233.08
Estimated Total Size (MB): 242.03
----------------------------------------------------------------
None
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)


import torch
import torchvision.transforms.functional as F


def img2tensor(img):
    """Convert an image to a tensor and normalize it channel by channel.

    Args:
        img: image accepted by ``torchvision.transforms.functional.to_tensor``.

    Returns:
        The normalized tensor.
    """
    # Per-channel mean / std (presumably the ImageNet statistics the
    # pretrained torchvision models expect -- confirm against the model docs)
    channel_mean = (0.485, 0.456, 0.406)
    channel_std = (0.229, 0.224, 0.225)
    return F.normalize(F.to_tensor(img), channel_mean, channel_std)


def catId2names(nums, labels=None):
    """Join the class names of ``nums`` into one string, last element first.

    Callers pass class indices sorted by ascending score, so the returned
    string lists the best-scoring class first.

    Args:
        nums: iterable of one-element tensors holding class indices.
        labels: mapping ``"class index" -> [WordNet id, class name]``;
            defaults to the module-level ``imagenet_labels``.

    Returns:
        Class names separated by ``", "`` in reversed input order.
    """
    if labels is None:
        labels = imagenet_labels
    titles = [labels[str(num.item())][1] for num in nums]
    # Reverse exactly once, after all names are collected. Reversing inside
    # the loop (as before) scrambled the order instead of inverting it.
    titles.reverse()
    return ", ".join(titles)


# AlexNet contains Dropout layers; switch to eval mode so they are disabled
# and the predictions are deterministic
alexnet.eval()

for i in range(6, 12):
    img, label = microImgNet[i * 6]
    tensor = img2tensor(img)
    with torch.no_grad():  # inference only, no gradients needed
        out = alexnet(tensor.unsqueeze(0))  # Add batch dimension
    labels_num = torch.argsort(out[0])  # Ascending order
    # Scores of the five best classes (previously this sliced the scores of
    # the last five class indices, which are unrelated to the ranking)
    weights = out[0][labels_num[-5:]]
    predicted = catId2names(labels_num[-5:])  # Top 5
    name = microImgNet.labels[i * 6][1]
    show(img, name, i - 6, predicted)


from torchvision import models

# weights=None builds the VGG16 architecture without loading a checkpoint,
# which is enough for printing the layer structure
vgg = models.vgg16(
    weights=None
)  # Pass e.g. weights="VGG16_Weights.DEFAULT" if you want to use VGG to predict something
print(vgg)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): ReLU(inplace=True)
    (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (15): ReLU(inplace=True)
    (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (22): ReLU(inplace=True)
    (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace=True)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace=True)
    (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (29): ReLU(inplace=True)
    (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(7, 7))
  (classifier): Sequential(
    (0): Linear(in_features=25088, out_features=4096, bias=True)
    (1): ReLU(inplace=True)
    (2): Dropout(p=0.5, inplace=False)
    (3): Linear(in_features=4096, out_features=4096, bias=True)
    (4): ReLU(inplace=True)
    (5): Dropout(p=0.5, inplace=False)
    (6): Linear(in_features=4096, out_features=1000, bias=True)
  )
)


import torch.nn as nn

# Kernel sizes to compare, largest first
conv_sizes = [11, 7, 5, 3]

for conv_size in conv_sizes:
    print("Convolution size: %ix%i" % (conv_size, conv_size))
    # 3 input channels -> 64 output channels with a square kernel
    conv_layer = nn.Conv2d(3, 64, conv_size, stride=1, padding=1)
    for tag, p in conv_layer.named_parameters():
        # 4 bytes per element: assumes the default float32 parameter dtype
        n_bytes = np.prod(p.shape) * 4
        print("Memory reqired for %s: %.2f kb" % (tag, n_bytes / 1024))

Convolution size: 11x11
Memory reqired for weight: 90.75 kb
Memory reqired for bias: 0.25 kb
Convolution size: 7x7
Memory reqired for weight: 36.75 kb
Memory reqired for bias: 0.25 kb
Convolution size: 5x5
Memory reqired for weight: 18.75 kb
Memory reqired for bias: 0.25 kb
Convolution size: 3x3
Memory reqired for weight: 6.75 kb
Memory reqired for bias: 0.25 kb


!nvidia-smi

Tue Jul 25 12:51:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P8     9W /  70W |      3MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+


!pip install -q GPUtil

  Preparing metadata (setup.py) ... done
  Building wheel for GPUtil (setup.py) ... done


import GPUtil as GPU
import psutil
import os


def gpu_usage():
    """Print the memory statistics of the first GPU reported by GPUtil.

    Returns:
        False when GPUtil reports no GPU; otherwise None after printing one
        line with free, used, utilization and total memory.
    """
    GPUs = GPU.getGPUs()
    # XXX: Colab offers at most one GPU, and even that one is not guaranteed
    if not GPUs:
        return False
    gpu = GPUs[0]
    # The pieces are joined with explicit spaces so the printed line no
    # longer depends on how the source code happens to be indented
    print(
        f"GPU RAM Free: {gpu.memoryFree:.0f}MB     "
        f"| Used: {gpu.memoryUsed:.0f}MB     "
        f"| Util {gpu.memoryUtil*100:3.0f}%     "
        f"| Total {gpu.memoryTotal:.0f}MB"
    )


import torchvision
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

vgg19 = torchvision.models.vgg19(weights=None, progress=True)
# Assigning `vgg19.requires_grad = True` only created a plain attribute on
# the module; requires_grad_() is the call that sets the flag on the parameters
vgg19.requires_grad_(True)
vgg19.to(device)

gpu_usage()  # Common GPU info

# Train mode, so the forward pass is measured the way it runs during training
vgg19.train()

# Run one forward pass per batch size and report GPU memory after each
for batch_size in [1, 8, 16, 32, 64]:
    input_random = torch.rand(batch_size, 3, 224, 224, device=device)
    out = vgg19(input_random)
    print("Batch size", batch_size)
    gpu_usage()

GPU RAM Free: 13928MB     | Used: 1173MB     | Util   8%     | Total 15360MB
Batch size 1
GPU RAM Free: 13392MB     | Used: 1709MB     | Util  11%     | Total 15360MB
Batch size 8
GPU RAM Free: 12578MB     | Used: 2523MB     | Util  16%     | Total 15360MB
Batch size 16
GPU RAM Free: 11498MB     | Used: 3603MB     | Util  23%     | Total 15360MB
Batch size 32
GPU RAM Free: 8948MB     | Used: 6153MB     | Util  40%     | Total 15360MB
Batch size 64
GPU RAM Free: 4632MB     | Used: 10469MB     | Util  68%     | Total 15360MB


# Ask PyTorch to release its unused cached GPU memory. `input_random` and
# `out` are still referenced here, and the reported usage barely changes
# (see the output below).
torch.cuda.empty_cache()
gpu_usage()

GPU RAM Free: 4682MB     | Used: 10419MB     | Util  68%     | Total 15360MB


# Drop the references to the last input batch and its output. The reported
# usage stays the same until the cache is emptied (see the output below).
input_random = None  # del input
out = None  # del out
gpu_usage()

GPU RAM Free: 4682MB     | Used: 10419MB     | Util  68%     | Total 15360MB


# With the tensors no longer referenced, emptying the cache now lowers the
# reported usage (see the output below)
torch.cuda.empty_cache()
gpu_usage()

GPU RAM Free: 6076MB     | Used: 9025MB     | Util  59%     | Total 15360MB


# Drop the reference to the model; as before, the reported usage does not
# change until the cache is emptied (see the output below)
vgg19 = None
gpu_usage()

GPU RAM Free: 6076MB     | Used: 9025MB     | Util  59%     | Total 15360MB


# Release the cached memory that presumably still belonged to the dropped model
torch.cuda.empty_cache()


import torchvision

# https://pytorch.org/vision/stable/_modules/torchvision/models/googlenet.html#googlenet
# https://hackmd.io/@bouteille/Bk-61Fo8U
# No pretrained weights are requested; init_weights=True asks torchvision to
# run its own weight initialization. The printout below includes the two
# auxiliary classifiers (aux1, aux2).
googlenet = torchvision.models.googlenet(init_weights=True)
print(googlenet)

GoogLeNet(
  (conv1): BasicConv2d(
    (conv): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool1): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (conv2): BasicConv2d(
    (conv): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (conv3): BasicConv2d(
    (conv): Conv2d(64, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
  )
  (maxpool2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception3a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(192, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(192, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(96, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(192, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(192, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (inception3b): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(128, 192, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(256, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(32, 96, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(256, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (maxpool3): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception4a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(480, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(480, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(96, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(96, 208, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(208, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(480, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(16, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(16, 48, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(480, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (inception4b): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(512, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(160, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(512, 112, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(112, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(112, 224, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(224, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(24, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (inception4c): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(256, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(512, 24, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(24, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(24, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (inception4d): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(512, 112, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(112, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(512, 144, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(144, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(144, 288, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(288, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(512, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(512, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (inception4e): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(528, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(256, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(528, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(160, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(320, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(528, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(528, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (maxpool4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=True)
  (inception5a): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(832, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(256, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(832, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(160, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(320, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(832, 32, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(32, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(32, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(832, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (inception5b): Inception(
    (branch1): BasicConv2d(
      (conv): Conv2d(832, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(384, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (branch2): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(832, 192, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(192, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(384, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch3): Sequential(
      (0): BasicConv2d(
        (conv): Conv2d(832, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(48, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicConv2d(
        (conv): Conv2d(48, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
    (branch4): Sequential(
      (0): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=True)
      (1): BasicConv2d(
        (conv): Conv2d(832, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
      )
    )
  )
  (aux1): InceptionAux(
    (conv): BasicConv2d(
      (conv): Conv2d(512, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (fc1): Linear(in_features=2048, out_features=1024, bias=True)
    (fc2): Linear(in_features=1024, out_features=1000, bias=True)
    (dropout): Dropout(p=0.7, inplace=False)
  )
  (aux2): InceptionAux(
    (conv): BasicConv2d(
      (conv): Conv2d(528, 128, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn): BatchNorm2d(128, eps=0.001, momentum=0.1, affine=True, track_running_stats=True)
    )
    (fc1): Linear(in_features=2048, out_features=1024, bias=True)
    (fc2): Linear(in_features=1024, out_features=1000, bias=True)
    (dropout): Dropout(p=0.7, inplace=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(1, 1))
  (dropout): Dropout(p=0.2, inplace=False)
  (fc): Linear(in_features=1024, out_features=1000, bias=True)
)


# Global average pooling: every channel's HxW map is reduced to one value
gap = torch.nn.AdaptiveAvgPool2d(1)
dummy_input = torch.randn(1, 3, 6, 6)
out = gap(dummy_input)
print("Raw out shape", out.shape)
# Drop the two trailing 1x1 spatial dimensions, keeping (batch, channels)
out = torch.flatten(out, start_dim=1)
print("Flatten out shape", out.shape)

Raw out shape torch.Size([1, 3, 1, 1])
Flatten out shape torch.Size([1, 3])


import torch.nn as nn
from PIL import Image
import torch


def file2tensor(filename):
    """Load an image file and return it as a normalized 3-channel tensor.

    Args:
        filename: path of the image file to read.

    Returns:
        The image as a tensor, normalized per channel.
    """
    tvf = torchvision.transforms.functional
    # The context manager closes the file as soon as the pixels are read.
    # Converting to RGB guarantees three channels, which the 3-value
    # mean/std below require (grayscale, palette or RGBA files would fail).
    with Image.open(filename) as img:
        t = tvf.to_tensor(img.convert("RGB"))
    return tvf.normalize(t, (0.485, 0.456, 0.406), (0.229, 0.224, 0.225))


class CNNfromHW(nn.Module):
    """Small CNN whose classifier does not depend on the input resolution.

    Global average pooling reduces the last feature map to one value per
    channel, so the linear layer always receives 32 features, whatever the
    height and width of the input image.
    """

    def __init__(self, conv_module=None):
        # NOTE: `conv_module` is accepted but not used anywhere in this class
        super().__init__()
        self.activation = nn.ReLU()
        # 3 x H x W -> 16 x H x W (padding keeps the spatial size)
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, padding=2)
        # 16 x H x W -> 16 x H/2 x W/2
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # 16 x H/2 x W/2 -> 32 x H/2 x W/2
        self.conv2 = nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, padding=1)
        # Any spatial size -> 32 x 1 x 1
        self.gap = nn.AdaptiveAvgPool2d((1, 1))
        # 32 features -> 10 class scores
        self.fc = nn.Linear(in_features=32, out_features=10)

    def forward(self, x):
        """Return class scores of shape (batch, 10); prints input/output shapes."""
        print("Input shape", x.shape)
        # conv1 -> pool -> conv2, then a single ReLU on the conv2 output
        features = self.activation(self.conv2(self.pool(self.conv1(x))))
        # Collapse the spatial dimensions, then flatten 32 x 1 x 1 to 32
        pooled = self.gap(features).flatten(start_dim=1)
        scores = self.fc(pooled)
        print("Output shape", scores.shape)
        return scores


# 32x32 RGB input, the same resolution as CIFAR-10 images
print("CIFAR10 like")
input_random = torch.rand(1, 3, 32, 32)
model_with_gap = CNNfromHW()
out = model_with_gap(input_random)


# Feed a full-size photo through the very same model
print("Arbitrary size")
# Different sizes work too!
aramdillo_t = file2tensor("imagen/n02454379_10511_armadillo.jpg")
out = model_with_gap(aramdillo_t.unsqueeze(0))  # unsqueeze adds the batch dimension

CIFAR10 like
Input shape torch.Size([1, 3, 32, 32])
Output shape torch.Size([1, 10])
Arbitrary size
Input shape torch.Size([1, 3, 500, 500])
Output shape torch.Size([1, 10])


import inspect
import torchvision.models.resnet as resnet

# BasicBlock
# Print the source of BasicBlock.forward to show where the input (identity)
# is added back to the output of the two convolutions
code = inspect.getsource(resnet.BasicBlock.forward)
print(code)

    def forward(self, x: Tensor) -> Tensor:
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


from torchvision import models

# NOTE: this rebinds the name `resnet`, which previously referred to the
# imported module torchvision.models.resnet.
resnet = models.resnet18(weights=None)  # weights=None: no pretrained weights are requested
print(resnet.layer2)  # show the modules that make up layer2

Sequential(
  (0): BasicBlock(
    (conv1): Conv2d(64, 128, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (downsample): Sequential(
      (0): Conv2d(64, 128, kernel_size=(1, 1), stride=(2, 2), bias=False)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (1): BasicBlock(
    (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)


# CPU test
from torch import nn
import time
import torch


def time_synchronized():
    """Return the current wall-clock time once pending CUDA work has finished.

    When CUDA is available the device is synchronized first, so that the
    timestamp is taken after previously queued GPU work has completed.
    Without CUDA this is just ``time.time()``.

    Returns:
        float: Seconds since the epoch, as reported by ``time.time()``.
    """
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    return time.time()


# Random batch: 8 feature maps with 512 channels, 112x112 each
input_random = torch.rand(8, 512, 112, 112)
# NOTE: each timed region below covers layer construction plus one forward pass.
start = time_synchronized()
normal_conv = nn.Conv2d(512, 1024, 3, groups=1)  # groups=1: ordinary (dense) convolution
out = normal_conv(input_random)
tm = time_synchronized() - start
print(f"Normal convolution take  {tm} sec.")

start = time_synchronized()
# groups=64: input and output channels are split into 64 independent groups
groupped_conv = nn.Conv2d(512, 1024, 3, groups=64)
out = groupped_conv(input_random)
tm = time_synchronized() - start
print(f"Groupped convolution take  {tm} sec.")

Normal convolution take  13.793790578842163 sec.
Groupped convolution take  0.8466033935546875 sec.


# GPU test
# Same comparison as the CPU test, reusing `input_random` from that cell.
# Falls back to the CPU when CUDA is not available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# NOTE: each timed region below covers layer construction, moving the layer
# and the input tensor to `device`, and one forward pass.
start = time_synchronized()
normal_conv = nn.Conv2d(512, 1024, 3, groups=1).to(device)
out = normal_conv(input_random.to(device))
tm = time_synchronized() - start
print(f"Normal convolution take  {tm} sec.")

start = time_synchronized()
groupped_conv = nn.Conv2d(512, 1024, 3, groups=64).to(device)
out = groupped_conv(input_random.to(device))
tm = time_synchronized() - start
print(f"Groupped convolution take  {tm} sec.")

Normal convolution take  0.2872188091278076 sec.
Groupped convolution take  0.06319403648376465 sec.


# Drop the references to the large benchmark tensors so their memory can be freed
input_random = None
out = None


from torchvision import models
from torchsummary import summary

# ResNeXt-50 (32x4d) built without requesting pretrained weights
resnext = models.resnext50_32x4d(weights=None)

# NOTE: summary() prints the table itself; wrapping it in print() additionally
# prints its return value, which is the trailing "None" in the cell output.
print(summary(resnext, (3, 224, 224), device="cpu"))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 64, 112, 112]           9,408
       BatchNorm2d-2         [-1, 64, 112, 112]             128
              ReLU-3         [-1, 64, 112, 112]               0
         MaxPool2d-4           [-1, 64, 56, 56]               0
            Conv2d-5          [-1, 128, 56, 56]           8,192
       BatchNorm2d-6          [-1, 128, 56, 56]             256
              ReLU-7          [-1, 128, 56, 56]               0
            Conv2d-8          [-1, 128, 56, 56]           4,608
       BatchNorm2d-9          [-1, 128, 56, 56]             256
             ReLU-10          [-1, 128, 56, 56]               0
           Conv2d-11          [-1, 256, 56, 56]          32,768
      BatchNorm2d-12          [-1, 256, 56, 56]             512
           Conv2d-13          [-1, 256, 56, 56]          16,384
      BatchNorm2d-14          [-1, 256, 56, 56]             512
             ReLU-15          [-1, 256, 56, 56]               0
       Bottleneck-16          [-1, 256, 56, 56]               0
           Conv2d-17          [-1, 128, 56, 56]          32,768
      BatchNorm2d-18          [-1, 128, 56, 56]             256
             ReLU-19          [-1, 128, 56, 56]               0
           Conv2d-20          [-1, 128, 56, 56]           4,608
      BatchNorm2d-21          [-1, 128, 56, 56]             256
             ReLU-22          [-1, 128, 56, 56]               0
           Conv2d-23          [-1, 256, 56, 56]          32,768
      BatchNorm2d-24          [-1, 256, 56, 56]             512
             ReLU-25          [-1, 256, 56, 56]               0
       Bottleneck-26          [-1, 256, 56, 56]               0
           Conv2d-27          [-1, 128, 56, 56]          32,768
      BatchNorm2d-28          [-1, 128, 56, 56]             256
             ReLU-29          [-1, 128, 56, 56]               0
           Conv2d-30          [-1, 128, 56, 56]           4,608
      BatchNorm2d-31          [-1, 128, 56, 56]             256
             ReLU-32          [-1, 128, 56, 56]               0
           Conv2d-33          [-1, 256, 56, 56]          32,768
      BatchNorm2d-34          [-1, 256, 56, 56]             512
             ReLU-35          [-1, 256, 56, 56]               0
       Bottleneck-36          [-1, 256, 56, 56]               0
           Conv2d-37          [-1, 256, 56, 56]          65,536
      BatchNorm2d-38          [-1, 256, 56, 56]             512
             ReLU-39          [-1, 256, 56, 56]               0
           Conv2d-40          [-1, 256, 28, 28]          18,432
      BatchNorm2d-41          [-1, 256, 28, 28]             512
             ReLU-42          [-1, 256, 28, 28]               0
           Conv2d-43          [-1, 512, 28, 28]         131,072
      BatchNorm2d-44          [-1, 512, 28, 28]           1,024
           Conv2d-45          [-1, 512, 28, 28]         131,072
      BatchNorm2d-46          [-1, 512, 28, 28]           1,024
             ReLU-47          [-1, 512, 28, 28]               0
       Bottleneck-48          [-1, 512, 28, 28]               0
           Conv2d-49          [-1, 256, 28, 28]         131,072
      BatchNorm2d-50          [-1, 256, 28, 28]             512
             ReLU-51          [-1, 256, 28, 28]               0
           Conv2d-52          [-1, 256, 28, 28]          18,432
      BatchNorm2d-53          [-1, 256, 28, 28]             512
             ReLU-54          [-1, 256, 28, 28]               0
           Conv2d-55          [-1, 512, 28, 28]         131,072
      BatchNorm2d-56          [-1, 512, 28, 28]           1,024
             ReLU-57          [-1, 512, 28, 28]               0
       Bottleneck-58          [-1, 512, 28, 28]               0
           Conv2d-59          [-1, 256, 28, 28]         131,072
      BatchNorm2d-60          [-1, 256, 28, 28]             512
             ReLU-61          [-1, 256, 28, 28]               0
           Conv2d-62          [-1, 256, 28, 28]          18,432
      BatchNorm2d-63          [-1, 256, 28, 28]             512
             ReLU-64          [-1, 256, 28, 28]               0
           Conv2d-65          [-1, 512, 28, 28]         131,072
      BatchNorm2d-66          [-1, 512, 28, 28]           1,024
             ReLU-67          [-1, 512, 28, 28]               0
       Bottleneck-68          [-1, 512, 28, 28]               0
           Conv2d-69          [-1, 256, 28, 28]         131,072
      BatchNorm2d-70          [-1, 256, 28, 28]             512
             ReLU-71          [-1, 256, 28, 28]               0
           Conv2d-72          [-1, 256, 28, 28]          18,432
      BatchNorm2d-73          [-1, 256, 28, 28]             512
             ReLU-74          [-1, 256, 28, 28]               0
           Conv2d-75          [-1, 512, 28, 28]         131,072
      BatchNorm2d-76          [-1, 512, 28, 28]           1,024
             ReLU-77          [-1, 512, 28, 28]               0
       Bottleneck-78          [-1, 512, 28, 28]               0
           Conv2d-79          [-1, 512, 28, 28]         262,144
      BatchNorm2d-80          [-1, 512, 28, 28]           1,024
             ReLU-81          [-1, 512, 28, 28]               0
           Conv2d-82          [-1, 512, 14, 14]          73,728
      BatchNorm2d-83          [-1, 512, 14, 14]           1,024
             ReLU-84          [-1, 512, 14, 14]               0
           Conv2d-85         [-1, 1024, 14, 14]         524,288
      BatchNorm2d-86         [-1, 1024, 14, 14]           2,048
           Conv2d-87         [-1, 1024, 14, 14]         524,288
      BatchNorm2d-88         [-1, 1024, 14, 14]           2,048
             ReLU-89         [-1, 1024, 14, 14]               0
       Bottleneck-90         [-1, 1024, 14, 14]               0
           Conv2d-91          [-1, 512, 14, 14]         524,288
      BatchNorm2d-92          [-1, 512, 14, 14]           1,024
             ReLU-93          [-1, 512, 14, 14]               0
           Conv2d-94          [-1, 512, 14, 14]          73,728
      BatchNorm2d-95          [-1, 512, 14, 14]           1,024
             ReLU-96          [-1, 512, 14, 14]               0
           Conv2d-97         [-1, 1024, 14, 14]         524,288
      BatchNorm2d-98         [-1, 1024, 14, 14]           2,048
             ReLU-99         [-1, 1024, 14, 14]               0
      Bottleneck-100         [-1, 1024, 14, 14]               0
          Conv2d-101          [-1, 512, 14, 14]         524,288
     BatchNorm2d-102          [-1, 512, 14, 14]           1,024
            ReLU-103          [-1, 512, 14, 14]               0
          Conv2d-104          [-1, 512, 14, 14]          73,728
     BatchNorm2d-105          [-1, 512, 14, 14]           1,024
            ReLU-106          [-1, 512, 14, 14]               0
          Conv2d-107         [-1, 1024, 14, 14]         524,288
     BatchNorm2d-108         [-1, 1024, 14, 14]           2,048
            ReLU-109         [-1, 1024, 14, 14]               0
      Bottleneck-110         [-1, 1024, 14, 14]               0
          Conv2d-111          [-1, 512, 14, 14]         524,288
     BatchNorm2d-112          [-1, 512, 14, 14]           1,024
            ReLU-113          [-1, 512, 14, 14]               0
          Conv2d-114          [-1, 512, 14, 14]          73,728
     BatchNorm2d-115          [-1, 512, 14, 14]           1,024
            ReLU-116          [-1, 512, 14, 14]               0
          Conv2d-117         [-1, 1024, 14, 14]         524,288
     BatchNorm2d-118         [-1, 1024, 14, 14]           2,048
            ReLU-119         [-1, 1024, 14, 14]               0
      Bottleneck-120         [-1, 1024, 14, 14]               0
          Conv2d-121          [-1, 512, 14, 14]         524,288
     BatchNorm2d-122          [-1, 512, 14, 14]           1,024
            ReLU-123          [-1, 512, 14, 14]               0
          Conv2d-124          [-1, 512, 14, 14]          73,728
     BatchNorm2d-125          [-1, 512, 14, 14]           1,024
            ReLU-126          [-1, 512, 14, 14]               0
          Conv2d-127         [-1, 1024, 14, 14]         524,288
     BatchNorm2d-128         [-1, 1024, 14, 14]           2,048
            ReLU-129         [-1, 1024, 14, 14]               0
      Bottleneck-130         [-1, 1024, 14, 14]               0
          Conv2d-131          [-1, 512, 14, 14]         524,288
     BatchNorm2d-132          [-1, 512, 14, 14]           1,024
            ReLU-133          [-1, 512, 14, 14]               0
          Conv2d-134          [-1, 512, 14, 14]          73,728
     BatchNorm2d-135          [-1, 512, 14, 14]           1,024
            ReLU-136          [-1, 512, 14, 14]               0
          Conv2d-137         [-1, 1024, 14, 14]         524,288
     BatchNorm2d-138         [-1, 1024, 14, 14]           2,048
            ReLU-139         [-1, 1024, 14, 14]               0
      Bottleneck-140         [-1, 1024, 14, 14]               0
          Conv2d-141         [-1, 1024, 14, 14]       1,048,576
     BatchNorm2d-142         [-1, 1024, 14, 14]           2,048
            ReLU-143         [-1, 1024, 14, 14]               0
          Conv2d-144           [-1, 1024, 7, 7]         294,912
     BatchNorm2d-145           [-1, 1024, 7, 7]           2,048
            ReLU-146           [-1, 1024, 7, 7]               0
          Conv2d-147           [-1, 2048, 7, 7]       2,097,152
     BatchNorm2d-148           [-1, 2048, 7, 7]           4,096
          Conv2d-149           [-1, 2048, 7, 7]       2,097,152
     BatchNorm2d-150           [-1, 2048, 7, 7]           4,096
            ReLU-151           [-1, 2048, 7, 7]               0
      Bottleneck-152           [-1, 2048, 7, 7]               0
          Conv2d-153           [-1, 1024, 7, 7]       2,097,152
     BatchNorm2d-154           [-1, 1024, 7, 7]           2,048
            ReLU-155           [-1, 1024, 7, 7]               0
          Conv2d-156           [-1, 1024, 7, 7]         294,912
     BatchNorm2d-157           [-1, 1024, 7, 7]           2,048
            ReLU-158           [-1, 1024, 7, 7]               0
          Conv2d-159           [-1, 2048, 7, 7]       2,097,152
     BatchNorm2d-160           [-1, 2048, 7, 7]           4,096
            ReLU-161           [-1, 2048, 7, 7]               0
      Bottleneck-162           [-1, 2048, 7, 7]               0
          Conv2d-163           [-1, 1024, 7, 7]       2,097,152
     BatchNorm2d-164           [-1, 1024, 7, 7]           2,048
            ReLU-165           [-1, 1024, 7, 7]               0
          Conv2d-166           [-1, 1024, 7, 7]         294,912
     BatchNorm2d-167           [-1, 1024, 7, 7]           2,048
            ReLU-168           [-1, 1024, 7, 7]               0
          Conv2d-169           [-1, 2048, 7, 7]       2,097,152
     BatchNorm2d-170           [-1, 2048, 7, 7]           4,096
            ReLU-171           [-1, 2048, 7, 7]               0
      Bottleneck-172           [-1, 2048, 7, 7]               0
AdaptiveAvgPool2d-173           [-1, 2048, 1, 1]               0
          Linear-174                 [-1, 1000]       2,049,000
================================================================
Total params: 25,028,904
Trainable params: 25,028,904
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 361.78
Params size (MB): 95.48
Estimated Total Size (MB): 457.83
----------------------------------------------------------------
None


## Custom SE block


class SE_Block(nn.Module):
    """Squeeze-and-Excitation block: rescales every channel by a learned gate.

    credits: https://github.com/moskomule/senet.pytorch/blob/master/senet/se_module.py#L4

    Args:
        c: Number of channels of the incoming feature map.
        r: Reduction ratio; the hidden layer of the gate network has
            ``c // r`` units.
    """

    def __init__(self, c, r=16):
        super().__init__()
        hidden = c // r
        # Squeeze: average every channel over its spatial extent -> c x 1 x 1
        self.squeeze = nn.AdaptiveAvgPool2d(1)
        # Excitation: bottleneck MLP ending in a sigmoid, one gate per channel
        self.excitation = nn.Sequential(
            nn.Linear(c, hidden, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(hidden, c, bias=False),
            nn.Sigmoid(),
        )

    def forward(self, x):
        """Scale the channels of ``x`` by their gates and print the gate shape.

        Args:
            x: 4-D feature map of shape (batch, channels, height, width).

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch, channels, _height, _width = x.shape
        pooled = self.squeeze(x).reshape(batch, channels)
        gates = self.excitation(pooled)[:, :, None, None]  # (batch, channels, 1, 1)
        print("Coefficients ", gates.shape)
        # Broadcasting stretches each gate over the whole spatial extent
        return x * gates


# Random feature map: batch of 16, 256 channels, 7x7 spatial size
dummy = torch.randn(16, 256, 7, 7)

se_block = SE_Block(256)  # for 256 channels
# Compare total magnitude before and after: every sigmoid gate lies in (0, 1),
# so the block can only shrink the absolute values.
print("Absolute sum", dummy.abs().sum().item())
se_out = se_block(dummy)
print("Sum after se_block", se_out.abs().sum().item())

Absolute sum 160095.890625
Coefficients  torch.Size([16, 256, 1, 1])
Sum after se_block 80070.8125


from torchvision.models import efficientnet_b0
from torchsummary import summary

# EfficientNet-B0 built with the constructor's default arguments
en_b0 = efficientnet_b0()
# NOTE: summary() prints the table itself; the outer print() only adds the
# trailing "None" seen at the end of the cell output.
print(summary(en_b0, (3, 224, 224), device="cpu"))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
================================================================
            Conv2d-1         [-1, 32, 112, 112]             864
       BatchNorm2d-2         [-1, 32, 112, 112]              64
              SiLU-3         [-1, 32, 112, 112]               0
            Conv2d-4         [-1, 32, 112, 112]             288
       BatchNorm2d-5         [-1, 32, 112, 112]              64
              SiLU-6         [-1, 32, 112, 112]               0
 AdaptiveAvgPool2d-7             [-1, 32, 1, 1]               0
            Conv2d-8              [-1, 8, 1, 1]             264
              SiLU-9              [-1, 8, 1, 1]               0
           Conv2d-10             [-1, 32, 1, 1]             288
          Sigmoid-11             [-1, 32, 1, 1]               0
SqueezeExcitation-12         [-1, 32, 112, 112]               0
           Conv2d-13         [-1, 16, 112, 112]             512
      BatchNorm2d-14         [-1, 16, 112, 112]              32
           MBConv-15         [-1, 16, 112, 112]               0
           Conv2d-16         [-1, 96, 112, 112]           1,536
      BatchNorm2d-17         [-1, 96, 112, 112]             192
             SiLU-18         [-1, 96, 112, 112]               0
           Conv2d-19           [-1, 96, 56, 56]             864
      BatchNorm2d-20           [-1, 96, 56, 56]             192
             SiLU-21           [-1, 96, 56, 56]               0
AdaptiveAvgPool2d-22             [-1, 96, 1, 1]               0
           Conv2d-23              [-1, 4, 1, 1]             388
             SiLU-24              [-1, 4, 1, 1]               0
           Conv2d-25             [-1, 96, 1, 1]             480
          Sigmoid-26             [-1, 96, 1, 1]               0
SqueezeExcitation-27           [-1, 96, 56, 56]               0
           Conv2d-28           [-1, 24, 56, 56]           2,304
      BatchNorm2d-29           [-1, 24, 56, 56]              48
           MBConv-30           [-1, 24, 56, 56]               0
           Conv2d-31          [-1, 144, 56, 56]           3,456
      BatchNorm2d-32          [-1, 144, 56, 56]             288
             SiLU-33          [-1, 144, 56, 56]               0
           Conv2d-34          [-1, 144, 56, 56]           1,296
      BatchNorm2d-35          [-1, 144, 56, 56]             288
             SiLU-36          [-1, 144, 56, 56]               0
AdaptiveAvgPool2d-37            [-1, 144, 1, 1]               0
           Conv2d-38              [-1, 6, 1, 1]             870
             SiLU-39              [-1, 6, 1, 1]               0
           Conv2d-40            [-1, 144, 1, 1]           1,008
          Sigmoid-41            [-1, 144, 1, 1]               0
SqueezeExcitation-42          [-1, 144, 56, 56]               0
           Conv2d-43           [-1, 24, 56, 56]           3,456
      BatchNorm2d-44           [-1, 24, 56, 56]              48
  StochasticDepth-45           [-1, 24, 56, 56]               0
           MBConv-46           [-1, 24, 56, 56]               0
           Conv2d-47          [-1, 144, 56, 56]           3,456
      BatchNorm2d-48          [-1, 144, 56, 56]             288
             SiLU-49          [-1, 144, 56, 56]               0
           Conv2d-50          [-1, 144, 28, 28]           3,600
      BatchNorm2d-51          [-1, 144, 28, 28]             288
             SiLU-52          [-1, 144, 28, 28]               0
AdaptiveAvgPool2d-53            [-1, 144, 1, 1]               0
           Conv2d-54              [-1, 6, 1, 1]             870
             SiLU-55              [-1, 6, 1, 1]               0
           Conv2d-56            [-1, 144, 1, 1]           1,008
          Sigmoid-57            [-1, 144, 1, 1]               0
SqueezeExcitation-58          [-1, 144, 28, 28]               0
           Conv2d-59           [-1, 40, 28, 28]           5,760
      BatchNorm2d-60           [-1, 40, 28, 28]              80
           MBConv-61           [-1, 40, 28, 28]               0
           Conv2d-62          [-1, 240, 28, 28]           9,600
      BatchNorm2d-63          [-1, 240, 28, 28]             480
             SiLU-64          [-1, 240, 28, 28]               0
           Conv2d-65          [-1, 240, 28, 28]           6,000
      BatchNorm2d-66          [-1, 240, 28, 28]             480
             SiLU-67          [-1, 240, 28, 28]               0
AdaptiveAvgPool2d-68            [-1, 240, 1, 1]               0
           Conv2d-69             [-1, 10, 1, 1]           2,410
             SiLU-70             [-1, 10, 1, 1]               0
           Conv2d-71            [-1, 240, 1, 1]           2,640
          Sigmoid-72            [-1, 240, 1, 1]               0
SqueezeExcitation-73          [-1, 240, 28, 28]               0
           Conv2d-74           [-1, 40, 28, 28]           9,600
      BatchNorm2d-75           [-1, 40, 28, 28]              80
  StochasticDepth-76           [-1, 40, 28, 28]               0
           MBConv-77           [-1, 40, 28, 28]               0
           Conv2d-78          [-1, 240, 28, 28]           9,600
      BatchNorm2d-79          [-1, 240, 28, 28]             480
             SiLU-80          [-1, 240, 28, 28]               0
           Conv2d-81          [-1, 240, 14, 14]           2,160
      BatchNorm2d-82          [-1, 240, 14, 14]             480
             SiLU-83          [-1, 240, 14, 14]               0
AdaptiveAvgPool2d-84            [-1, 240, 1, 1]               0
           Conv2d-85             [-1, 10, 1, 1]           2,410
             SiLU-86             [-1, 10, 1, 1]               0
           Conv2d-87            [-1, 240, 1, 1]           2,640
          Sigmoid-88            [-1, 240, 1, 1]               0
SqueezeExcitation-89          [-1, 240, 14, 14]               0
           Conv2d-90           [-1, 80, 14, 14]          19,200
      BatchNorm2d-91           [-1, 80, 14, 14]             160
           MBConv-92           [-1, 80, 14, 14]               0
           Conv2d-93          [-1, 480, 14, 14]          38,400
      BatchNorm2d-94          [-1, 480, 14, 14]             960
             SiLU-95          [-1, 480, 14, 14]               0
           Conv2d-96          [-1, 480, 14, 14]           4,320
      BatchNorm2d-97          [-1, 480, 14, 14]             960
             SiLU-98          [-1, 480, 14, 14]               0
AdaptiveAvgPool2d-99            [-1, 480, 1, 1]               0
          Conv2d-100             [-1, 20, 1, 1]           9,620
            SiLU-101             [-1, 20, 1, 1]               0
          Conv2d-102            [-1, 480, 1, 1]          10,080
         Sigmoid-103            [-1, 480, 1, 1]               0
SqueezeExcitation-104          [-1, 480, 14, 14]               0
          Conv2d-105           [-1, 80, 14, 14]          38,400
     BatchNorm2d-106           [-1, 80, 14, 14]             160
 StochasticDepth-107           [-1, 80, 14, 14]               0
          MBConv-108           [-1, 80, 14, 14]               0
          Conv2d-109          [-1, 480, 14, 14]          38,400
     BatchNorm2d-110          [-1, 480, 14, 14]             960
            SiLU-111          [-1, 480, 14, 14]               0
          Conv2d-112          [-1, 480, 14, 14]           4,320
     BatchNorm2d-113          [-1, 480, 14, 14]             960
            SiLU-114          [-1, 480, 14, 14]               0
AdaptiveAvgPool2d-115            [-1, 480, 1, 1]               0
          Conv2d-116             [-1, 20, 1, 1]           9,620
            SiLU-117             [-1, 20, 1, 1]               0
          Conv2d-118            [-1, 480, 1, 1]          10,080
         Sigmoid-119            [-1, 480, 1, 1]               0
SqueezeExcitation-120          [-1, 480, 14, 14]               0
          Conv2d-121           [-1, 80, 14, 14]          38,400
     BatchNorm2d-122           [-1, 80, 14, 14]             160
 StochasticDepth-123           [-1, 80, 14, 14]               0
          MBConv-124           [-1, 80, 14, 14]               0
          Conv2d-125          [-1, 480, 14, 14]          38,400
     BatchNorm2d-126          [-1, 480, 14, 14]             960
            SiLU-127          [-1, 480, 14, 14]               0
          Conv2d-128          [-1, 480, 14, 14]          12,000
     BatchNorm2d-129          [-1, 480, 14, 14]             960
            SiLU-130          [-1, 480, 14, 14]               0
AdaptiveAvgPool2d-131            [-1, 480, 1, 1]               0
          Conv2d-132             [-1, 20, 1, 1]           9,620
            SiLU-133             [-1, 20, 1, 1]               0
          Conv2d-134            [-1, 480, 1, 1]          10,080
         Sigmoid-135            [-1, 480, 1, 1]               0
SqueezeExcitation-136          [-1, 480, 14, 14]               0
          Conv2d-137          [-1, 112, 14, 14]          53,760
     BatchNorm2d-138          [-1, 112, 14, 14]             224
          MBConv-139          [-1, 112, 14, 14]               0
          Conv2d-140          [-1, 672, 14, 14]          75,264
     BatchNorm2d-141          [-1, 672, 14, 14]           1,344
            SiLU-142          [-1, 672, 14, 14]               0
          Conv2d-143          [-1, 672, 14, 14]          16,800
     BatchNorm2d-144          [-1, 672, 14, 14]           1,344
            SiLU-145          [-1, 672, 14, 14]               0
AdaptiveAvgPool2d-146            [-1, 672, 1, 1]               0
          Conv2d-147             [-1, 28, 1, 1]          18,844
            SiLU-148             [-1, 28, 1, 1]               0
          Conv2d-149            [-1, 672, 1, 1]          19,488
         Sigmoid-150            [-1, 672, 1, 1]               0
SqueezeExcitation-151          [-1, 672, 14, 14]               0
          Conv2d-152          [-1, 112, 14, 14]          75,264
     BatchNorm2d-153          [-1, 112, 14, 14]             224
 StochasticDepth-154          [-1, 112, 14, 14]               0
          MBConv-155          [-1, 112, 14, 14]               0
          Conv2d-156          [-1, 672, 14, 14]          75,264
     BatchNorm2d-157          [-1, 672, 14, 14]           1,344
            SiLU-158          [-1, 672, 14, 14]               0
          Conv2d-159          [-1, 672, 14, 14]          16,800
     BatchNorm2d-160          [-1, 672, 14, 14]           1,344
            SiLU-161          [-1, 672, 14, 14]               0
AdaptiveAvgPool2d-162            [-1, 672, 1, 1]               0
          Conv2d-163             [-1, 28, 1, 1]          18,844
            SiLU-164             [-1, 28, 1, 1]               0
          Conv2d-165            [-1, 672, 1, 1]          19,488
         Sigmoid-166            [-1, 672, 1, 1]               0
SqueezeExcitation-167          [-1, 672, 14, 14]               0
          Conv2d-168          [-1, 112, 14, 14]          75,264
     BatchNorm2d-169          [-1, 112, 14, 14]             224
 StochasticDepth-170          [-1, 112, 14, 14]               0
          MBConv-171          [-1, 112, 14, 14]               0
          Conv2d-172          [-1, 672, 14, 14]          75,264
     BatchNorm2d-173          [-1, 672, 14, 14]           1,344
            SiLU-174          [-1, 672, 14, 14]               0
          Conv2d-175            [-1, 672, 7, 7]          16,800
     BatchNorm2d-176            [-1, 672, 7, 7]           1,344
            SiLU-177            [-1, 672, 7, 7]               0
AdaptiveAvgPool2d-178            [-1, 672, 1, 1]               0
          Conv2d-179             [-1, 28, 1, 1]          18,844
            SiLU-180             [-1, 28, 1, 1]               0
          Conv2d-181            [-1, 672, 1, 1]          19,488
         Sigmoid-182            [-1, 672, 1, 1]               0
SqueezeExcitation-183            [-1, 672, 7, 7]               0
          Conv2d-184            [-1, 192, 7, 7]         129,024
     BatchNorm2d-185            [-1, 192, 7, 7]             384
          MBConv-186            [-1, 192, 7, 7]               0
          Conv2d-187           [-1, 1152, 7, 7]         221,184
     BatchNorm2d-188           [-1, 1152, 7, 7]           2,304
            SiLU-189           [-1, 1152, 7, 7]               0
          Conv2d-190           [-1, 1152, 7, 7]          28,800
     BatchNorm2d-191           [-1, 1152, 7, 7]           2,304
            SiLU-192           [-1, 1152, 7, 7]               0
AdaptiveAvgPool2d-193           [-1, 1152, 1, 1]               0
          Conv2d-194             [-1, 48, 1, 1]          55,344
            SiLU-195             [-1, 48, 1, 1]               0
          Conv2d-196           [-1, 1152, 1, 1]          56,448
         Sigmoid-197           [-1, 1152, 1, 1]               0
SqueezeExcitation-198           [-1, 1152, 7, 7]               0
          Conv2d-199            [-1, 192, 7, 7]         221,184
     BatchNorm2d-200            [-1, 192, 7, 7]             384
 StochasticDepth-201            [-1, 192, 7, 7]               0
          MBConv-202            [-1, 192, 7, 7]               0
          Conv2d-203           [-1, 1152, 7, 7]         221,184
     BatchNorm2d-204           [-1, 1152, 7, 7]           2,304
            SiLU-205           [-1, 1152, 7, 7]               0
          Conv2d-206           [-1, 1152, 7, 7]          28,800
     BatchNorm2d-207           [-1, 1152, 7, 7]           2,304
            SiLU-208           [-1, 1152, 7, 7]               0
AdaptiveAvgPool2d-209           [-1, 1152, 1, 1]               0
          Conv2d-210             [-1, 48, 1, 1]          55,344
            SiLU-211             [-1, 48, 1, 1]               0
          Conv2d-212           [-1, 1152, 1, 1]          56,448
         Sigmoid-213           [-1, 1152, 1, 1]               0
SqueezeExcitation-214           [-1, 1152, 7, 7]               0
          Conv2d-215            [-1, 192, 7, 7]         221,184
     BatchNorm2d-216            [-1, 192, 7, 7]             384
 StochasticDepth-217            [-1, 192, 7, 7]               0
          MBConv-218            [-1, 192, 7, 7]               0
          Conv2d-219           [-1, 1152, 7, 7]         221,184
     BatchNorm2d-220           [-1, 1152, 7, 7]           2,304
            SiLU-221           [-1, 1152, 7, 7]               0
          Conv2d-222           [-1, 1152, 7, 7]          28,800
     BatchNorm2d-223           [-1, 1152, 7, 7]           2,304
            SiLU-224           [-1, 1152, 7, 7]               0
AdaptiveAvgPool2d-225           [-1, 1152, 1, 1]               0
          Conv2d-226             [-1, 48, 1, 1]          55,344
            SiLU-227             [-1, 48, 1, 1]               0
          Conv2d-228           [-1, 1152, 1, 1]          56,448
         Sigmoid-229           [-1, 1152, 1, 1]               0
SqueezeExcitation-230           [-1, 1152, 7, 7]               0
          Conv2d-231            [-1, 192, 7, 7]         221,184
     BatchNorm2d-232            [-1, 192, 7, 7]             384
 StochasticDepth-233            [-1, 192, 7, 7]               0
          MBConv-234            [-1, 192, 7, 7]               0
          Conv2d-235           [-1, 1152, 7, 7]         221,184
     BatchNorm2d-236           [-1, 1152, 7, 7]           2,304
            SiLU-237           [-1, 1152, 7, 7]               0
          Conv2d-238           [-1, 1152, 7, 7]          10,368
     BatchNorm2d-239           [-1, 1152, 7, 7]           2,304
            SiLU-240           [-1, 1152, 7, 7]               0
AdaptiveAvgPool2d-241           [-1, 1152, 1, 1]               0
          Conv2d-242             [-1, 48, 1, 1]          55,344
            SiLU-243             [-1, 48, 1, 1]               0
          Conv2d-244           [-1, 1152, 1, 1]          56,448
         Sigmoid-245           [-1, 1152, 1, 1]               0
SqueezeExcitation-246           [-1, 1152, 7, 7]               0
          Conv2d-247            [-1, 320, 7, 7]         368,640
     BatchNorm2d-248            [-1, 320, 7, 7]             640
          MBConv-249            [-1, 320, 7, 7]               0
          Conv2d-250           [-1, 1280, 7, 7]         409,600
     BatchNorm2d-251           [-1, 1280, 7, 7]           2,560
            SiLU-252           [-1, 1280, 7, 7]               0
AdaptiveAvgPool2d-253           [-1, 1280, 1, 1]               0
         Dropout-254                 [-1, 1280]               0
          Linear-255                 [-1, 1000]       1,281,000
================================================================
Total params: 5,288,548
Trainable params: 5,288,548
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 173.65
Params size (MB): 20.17
Estimated Total Size (MB): 194.40
----------------------------------------------------------------
None


# Download the sample picture and store it as image.jpg in the working directory
URL = "https://edunet.kea.su/repo/EduNet-web_dependencies/L09/cat.jpeg"
!wget -q $URL -O image.jpg


from torchvision import utils, transforms
import matplotlib.pyplot as plt
import torch
from PIL import Image

# Load the downloaded picture and bring it to a fixed 256x256 float tensor.
img = Image.open("image.jpg")

transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])

img = transform(img)  # channels x 256 x 256

# Cut the image into non-overlapping sz x sz patches, scanning row by row.
sz = 64
patches = [
    img[:, r : r + sz, c : c + sz]
    for r in range(0, img.shape[1], sz)
    for c in range(0, img.shape[2], sz)
]

patches = torch.stack(patches).type(torch.float)

# make_grid fills the border *after* normalization, so pad_value has to stay
# inside the [0, 1] range of the normalized float image. The previous value
# (10) only looked white because the later uint8 conversion overflowed.
# nrow is derived from the patch size so the grid mirrors the image layout.
img_grid = utils.make_grid(
    patches, pad_value=1.0, normalize=True, nrow=img.shape[2] // sz
)
plt.imshow(transforms.ToPILImage()(img_grid).convert("RGB"))
plt.axis("off")
plt.show()


# Show the same patches as one flat sequence (a single row), i.e. the way a
# transformer consumes them.
plt.figure(figsize=(18, 6))
# nrow=len(patches) puts every patch on one row regardless of the patch count.
# pad_value stays inside [0, 1] because the grid is a normalized float image.
img_grid = utils.make_grid(
    patches, pad_value=1.0, normalize=True, nrow=len(patches)
)
plt.imshow(transforms.ToPILImage()(img_grid).convert("RGB"))
plt.axis("off");


import torch
import torch.nn as nn


class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is a batch of sequences shaped (batch, seq_len, input_dim).
    Queries, keys and values are produced by three independent linear
    projections of the same input, and the output has the input's shape.
    """

    def __init__(self, input_dim):
        super().__init__()
        self.input_dim = input_dim
        # Created in the order query -> key -> value so that parameter
        # registration and random initialisation order stay unchanged.
        self.query, self.key, self.value = (
            nn.Linear(input_dim, input_dim) for _ in range(3)
        )

    def forward(self, x):
        """Return the attention-weighted values for every sequence position."""
        q, k, v = self.query(x), self.key(x), self.value(x)

        # Pairwise similarity of positions, scaled by sqrt(input_dim).
        scale = self.input_dim**0.5
        scores = torch.bmm(q, k.transpose(1, 2)) / scale
        print("Scores shape", scores.shape)

        # Each row becomes a probability distribution over the positions.
        attention = torch.softmax(scores, dim=2)
        return torch.bmm(attention, v)


# Smoke test of SelfAttention on random data.
embed_dim = 256
self_attention_layer = SelfAttention(embed_dim)
# 4 * 4 = 16 sequence positions, e.g. one token per patch of a 4x4 patch grid.
dummy_x = torch.randn(1, 4 * 4, embed_dim)  # Batch_size x Sequence_len x Embedding_size
out = self_attention_layer(dummy_x)
# The output keeps the input shape: one embed_dim vector per sequence position.
print(out.shape)

Scores shape torch.Size([1, 16, 16])
torch.Size([1, 16, 256])


!pip install -q pytorch_pretrained_vit

  Preparing metadata (setup.py) ... done
  Building wheel for pytorch_pretrained_vit (setup.py) ... done


from pytorch_pretrained_vit import ViT
from torchvision import transforms

model = ViT("B_16_imagenet1k", pretrained=True)
model.eval()

Downloading: "https://github.com/lukemelas/PyTorch-Pretrained-ViT/releases/download/0.0.2/B_16_imagenet1k.pth" to /root/.cache/torch/hub/checkpoints/B_16_imagenet1k.pth
100%|██████████| 331M/331M [00:02<00:00, 122MB/s]

Loaded pretrained weights.

ViT(
  (patch_embedding): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
  (positional_embedding): PositionalEmbedding1D()
  (transformer): Transformer(
    (blocks): ModuleList(
      (0-11): 12 x Block(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=768, out_features=768, bias=True)
          (proj_k): Linear(in_features=768, out_features=768, bias=True)
          (proj_v): Linear(in_features=768, out_features=768, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (pwff): PositionWiseFeedForward(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (drop): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (norm): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
  (fc): Linear(in_features=768, out_features=1000, bias=True)
)


# Load image
!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/L09/capybara.jpg


capybara_in_pil = Image.open("capybara.jpg")
# The pipeline is stored under its own name. Assigning it to `transforms`
# would shadow the imported torchvision.transforms module and make this cell
# fail when it is executed a second time.
vit_transform = transforms.Compose(
    [
        # The input is resized to 384x384 before being fed to the model.
        transforms.Resize((384, 384)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ]
)
capybara_in_tensor = vit_transform(capybara_in_pil)
print(capybara_in_tensor.shape)  # torch.Size([3, 384, 384])

# Classify: add the batch dimension and skip gradient tracking for inference.
with torch.no_grad():
    outputs = model(capybara_in_tensor.unsqueeze(0))
print(outputs.shape)  # torch.Size([1, 1000])

torch.Size([3, 384, 384])
torch.Size([1, 1000])


# Indices of the three largest logits of the single image in the batch.
top3 = outputs[0].topk(3).indices
# Convert to plain Python ints so they can be printed and used as indices.
top3 = top3.tolist()


print("Top 3 predictions:")
for class_num in top3:
    # `classes` maps an ImageNet class index to its human-readable name.
    print(class_num, classes[class_num])
# Show the picture at the same 384x384 size the model received.
display(capybara_in_pil.resize((384, 384)))

Top 3 predictions:
337 beaver
336 marmot
338 guinea_pig


!pip install -q timm

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.2/2.2 MB 10.1 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 268.8/268.8 kB 11.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.3/1.3 MB 17.6 MB/s eta 0:00:00


import torch

model = torch.hub.load(
    "facebookresearch/deit:main", "deit_tiny_patch16_224", pretrained=True
)

Downloading: "https://github.com/facebookresearch/deit/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/deit/deit_tiny_patch16_224-a1311bcf.pth" to /root/.cache/torch/hub/checkpoints/deit_tiny_patch16_224-a1311bcf.pth
100%|██████████| 21.9M/21.9M [00:00<00:00, 29.5MB/s]


!wget -q https://edunet.kea.su/repo/EduNet-web_dependencies/L09/capybara.jpg


from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
import torchvision.transforms as T
from PIL import Image

pil = Image.open("capybara.jpg")

# create the data transform that DeiT expects:
# 224x224 input, tensor conversion, then normalization with timm's ImageNet
# default mean/std constants.
imagenet_transform = T.Compose(
    [
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    ]
)

# unsqueeze(0) adds the batch dimension; the result holds one row of logits.
out = model(imagenet_transform(pil).unsqueeze(0))
print(out.shape)
# Last expression of the cell: the notebook renders the resized picture.
pil.resize((224, 224))

torch.Size([1, 1000])


print(model)

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 192, kernel_size=(16, 16), stride=(16, 16))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (1): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (2): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (3): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (4): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (5): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (6): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (7): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (8): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (9): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (10): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
    (11): Block(
      (norm1): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=192, out_features=576, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=192, out_features=192, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): Identity()
      (drop_path1): Identity()
      (norm2): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=192, out_features=768, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=768, out_features=192, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (ls2): Identity()
      (drop_path2): Identity()
    )
  )
  (norm): LayerNorm((192,), eps=1e-06, elementwise_affine=True)
  (fc_norm): Identity()
  (head_drop): Dropout(p=0.0, inplace=False)
  (head): Linear(in_features=192, out_features=1000, bias=True)
)


print(model.head)

Linear(in_features=192, out_features=1000, bias=True)


model.head = torch.nn.Linear(192, 10, bias=True)


out = model(imagenet_transform(pil).unsqueeze(0))
print(out.shape)

torch.Size([1, 10])


from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader

# CIFAR10 training split, preprocessed with the same 224x224 ImageNet
# pipeline that was used for the single-image DeiT prediction.
cifar10 = CIFAR10(root="./", train=True, download=True, transform=imagenet_transform)

# We use only part of CIFAR10 to reduce training time
# (10000 randomly chosen samples are kept, the other 40000 are discarded).
trainset, _ = torch.utils.data.random_split(cifar10, [10000, 40000])
train_loader = DataLoader(trainset, batch_size=128, shuffle=True, num_workers=2)

# The test split is used in full and is not shuffled.
testset = CIFAR10(root="./", train=False, download=True, transform=imagenet_transform)
test_loader = DataLoader(testset, batch_size=128, shuffle=False, num_workers=2)

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./cifar-10-python.tar.gz

100%|██████████| 170498071/170498071 [00:02<00:00, 81264865.69it/s]

Extracting ./cifar-10-python.tar.gz to ./
Files already downloaded and verified


from torch import nn
from tqdm.notebook import tqdm_notebook

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def train(model, train_loader, optimizer, num_epochs=1):
    """Train *model* in place with cross-entropy loss.

    Args:
        model: network to optimise; it is moved to the module-level `device`
            and switched to training mode.
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        optimizer: optimizer that updates the parameters it was built with.
        num_epochs: number of full passes over ``train_loader``.
    """
    criterion = nn.CrossEntropyLoss()
    model.to(device)
    model.train()

    for _ in range(num_epochs):
        # tqdm_notebook renders a progress bar for every epoch.
        for inputs, labels in tqdm_notebook(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()


import torch.optim as optim

model.to(device)
optimizer = optim.SGD(model.head.parameters(), lr=0.001, momentum=0.9)
train(model, train_loader, optimizer)

  0%|          | 0/79 [00:00<?, ?it/s]


@torch.inference_mode()
def accuracy(model, testloader):
    """Return the fraction of correctly classified samples in *testloader*.

    The model is evaluated in eval mode, so dropout is disabled and
    batch-norm uses its running statistics. The training flag it had on
    entry is restored afterwards.

    Args:
        model: classifier returning logits of shape (batch, num_classes).
        testloader: iterable yielding ``(images, labels)`` batches.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ZeroDivisionError: if *testloader* yields no samples.
    """
    is_module = isinstance(model, torch.nn.Module)
    was_training = is_module and model.training
    if was_training:
        model.eval()

    # Run on the device that holds the model's weights. Fall back to the
    # module-level `device` for models without parameters.
    first_param = next(model.parameters(), None) if is_module else None
    target = first_param.device if first_param is not None else device

    correct = 0
    total = 0
    try:
        for images, labels in testloader:
            outputs = model(images.to(target))
            # the class with the highest logit is what we choose as prediction
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(target)).sum().item()
    finally:
        # Leave the model in the mode the caller had it in.
        if was_training:
            model.train()
    return correct / total


print(f"Accuracy of fine-tuned network : {accuracy(model, test_loader):.2f} ")

Accuracy of fine-tuned network : 0.78


def get_model(num_classes=10):
    """Load a pretrained DeiT-tiny and replace its classification head.

    Args:
        num_classes: number of outputs of the new, randomly initialised head.
            The default of 10 matches CIFAR10.

    Returns:
        The model with a fresh ``head`` linear layer.
    """
    model = torch.hub.load(
        "facebookresearch/deit:main", "deit_tiny_patch16_224", pretrained=True
    )
    # Take the feature width from the existing head instead of hard-coding it.
    model.head = torch.nn.Linear(model.head.in_features, num_classes, bias=True)
    return model


model = get_model()

Using cache found in /root/.cache/torch/hub/facebookresearch_deit_main


# Same preprocessing as imagenet_transform, but without the resize step, so
# images keep their native size.
cifar_transform = T.Compose(
    [
        # T.Resize((224, 224)),    intentionally disabled; keep this line for reference
        T.ToTensor(),
        T.Normalize(IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD),
    ]
)

# Change transformation in base dataset
# (trainset was split from cifar10, so it presumably picks up the new transform too)
cifar10.transform = cifar_transform
first_img = trainset[0][0]

# Run on CPU and print the error instead of stopping the notebook: the model
# is expected to reject an input whose size differs from the one it was
# built for.
model.to(torch.device("cpu"))
try:
    out = model(first_img.unsqueeze(0))
except Exception as e:
    print("Exception:", e)

Exception: Input image height (32) doesn't match model (224).


model.patch_embed.img_size = (32, 32)
try:
    out = model(first_img.unsqueeze(0))
except Exception as e:
    print("Exception:", e)

Exception: The size of tensor a (5) must match the size of tensor b (197) at non-singleton dimension 1


model.pos_embed.data.shape

torch.Size([1, 197, 192])


# Keep only the first 5 positional embeddings so that their count matches the
# 5 tokens produced for a 32x32 input (16x16 patches give a 2x2 grid, i.e. 4
# patch tokens; the fifth is presumably the class token, to be confirmed).
# NOTE(review): the kept vectors were learned for the original 224x224 patch
# grid, so their spatial meaning is probably not preserved.
model.pos_embed.data = model.pos_embed.data[:, :5, :]
out = model(first_img.unsqueeze(0))
print(out.shape)

torch.Size([1, 10])


# Train on the whole CIFAR10 training split at native resolution (no resize).
cifar10.transform = cifar_transform
train_loader = DataLoader(cifar10, batch_size=512, shuffle=True, num_workers=2)

# Now we train all parameters because model altered
# (the optimizer receives model.parameters(), not only the head).
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
train(model, train_loader, optimizer)

  0%|          | 0/98 [00:00<?, ?it/s]


testset.transform = cifar_transform
print(f"Accuracy of altered network : {accuracy(model,test_loader):.2f} ")

Accuracy of altered network : 0.53


from matplotlib import pyplot as plt

logits = [0.1, 0.1, 0.4, 5, 0.1, 0.2, 0.1, 0.2, 3, 0.7]

plt.figure(figsize=(6, 3))
plt.bar(range(0, 10), logits)
plt.xticks(range(0, 10))
plt.show()


import torch
from torch.nn.functional import softmax

probs = softmax(torch.tensor(logits), dim=0)

plt.figure(figsize=(6, 3))
plt.bar(range(0, 10), probs)
plt.xticks(range(0, 10))
plt.show()


one_hot = (probs >= probs.max()).int()
print("One hot ", one_hot)

One hot  tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=torch.int32)


import numpy as np

f, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(15, 4), sharey=False)


def bar(ax, y, title):
    """Draw *y* as a bar chart on *ax* and give the subplot a title.

    Args:
        ax: matplotlib axes to draw on.
        y: sequence of bar heights (list, NumPy array or CPU tensor).
        title: text shown above the subplot.
    """
    heights = np.array(y)
    # One bar per value. A scalar keeps the original ten fixed positions.
    positions = range(heights.shape[0]) if heights.ndim else range(0, 10)
    ax.bar(positions, heights)
    ax.set_title(title)


bar(ax1, logits, "Logits")
bar(ax2, probs, "Probs")
bar(ax3, one_hot, "Label")

plt.show()


!pip install -q byol-pytorch


import torch
from byol_pytorch import BYOL
from torchvision import models
from torchvision.datasets import ImageFolder, DatasetFolder
from tqdm import tqdm
from warnings import simplefilter

simplefilter("ignore", UserWarning)

resnet = models.resnet50(weights=None)

learner = BYOL(resnet, image_size=256, hidden_layer="avgpool")

learner.to(device)
opt = torch.optim.Adam(learner.parameters(), lr=3e-4)


def sample_unlabelled_images(batch_size=20, image_size=256):
    """Return a random batch standing in for real unlabelled images.

    Args:
        batch_size: number of images in the batch.
        image_size: height and width of every square image; it should match
            the ``image_size`` the BYOL learner was created with.

    Returns:
        Float tensor of shape (batch_size, 3, image_size, image_size) filled
        with standard-normal noise.
    """
    return torch.randn(batch_size, 3, image_size, image_size)


# Three demonstration steps of self-supervised BYOL training on random data.
for _ in tqdm(range(3)):
    images = sample_unlabelled_images()
    # The learner returns the loss directly; no labels are involved.
    loss = learner(images.to(device))
    opt.zero_grad()
    loss.backward()
    opt.step()
    learner.update_moving_average()  # update moving average of target encoder

# save your improved network
# (only the wrapped resnet's weights are saved, not the BYOL learner's)
torch.save(resnet.state_dict(), "./improved-net.pt")

100%|██████████| 3/3 [00:02<00:00,  1.27it/s]


!pip install -q git+https://github.com/openai/CLIP.git

  Preparing metadata (setup.py) ... done
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 53.1/53.1 kB 1.5 MB/s eta 0:00:00
  Building wheel for clip (setup.py) ... done


img, class_num = microImgNet[200]
show(img, microImgNet.labels[200][1], 0)


texts = [
    "A man with a gasoline saw is getting firewood",
    "Santa Claus sleigh",
    "chain saw",
    "cat",
    "dog",
]


import clip

print(clip.available_models())

['RN50', 'RN101', 'RN50x4', 'RN50x16', 'RN50x64', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px']


import clip

model, preprocess = clip.load("ViT-B/32", device="cpu")

100%|███████████████████████████████████████| 338M/338M [00:04<00:00, 76.2MiB/s]


image = preprocess(img).unsqueeze(0)
text = clip.tokenize(texts)


# Score the image against every candidate text in one forward pass.
with torch.no_grad():
    # Only the image-to-text logits are needed; the second output is ignored.
    logits_per_image, _ = model(image, text)
    # Softmax over the texts turns the logits into probabilities.
    probs = logits_per_image.softmax(dim=-1).numpy()

print(probs)

# One bar per candidate text, in the order of `texts`.
plt.figure(figsize=(6, 3))
plt.bar(range(len(texts)), probs.flatten())
plt.show()

[[9.9127918e-01 5.4925272e-06 8.7150745e-03 5.6151421e-08 2.9277066e-07]]


image_features = model.encode_image(image).detach().cpu()
text_features = model.encode_text(text).detach().cpu()
print("Image", image_features.shape)
print("Text", text_features.shape)

Image torch.Size([1, 512])
Text torch.Size([5, 512])


print(np.linalg.norm(image_features[0]))

10.16952


from torch.nn.functional import normalize

image_features = normalize(image_features)
text_features = normalize(text_features)

print(np.linalg.norm(image_features[0].cpu()))
print(np.linalg.norm(text_features.cpu(), axis=1))

0.99999994
[1. 1. 1. 1. 1.]


# Dot product of the image embedding with every text embedding. On
# unit-length vectors this is the cosine similarity.
similarities = [torch.dot(image_features[0], t).item() for t in text_features]
print(similarities)

[0.3043336272239685, 0.18330001831054688, 0.25699421763420105, 0.1374691277742386, 0.153982475399971]


sims = torch.matmul(text_features, image_features.T)
print(sims.detach().cpu().tolist())

[[0.3043336570262909], [0.18330001831054688], [0.25699421763420105], [0.1374691277742386], [0.1539824903011322]]


plt.figure(figsize=(6, 3))
plt.bar(range(len(similarities)), similarities)
plt.show()


chilled_sims = sims.flatten() * 100


s = chilled_sims.softmax(dim=0).numpy()
print(s)
plt.figure(figsize=(6, 4))
plt.bar(range(len(s)), s)
plt.show()

[9.9127930e-01 5.4925276e-06 8.7150587e-03 5.6151642e-08 2.9277098e-07]


# One text prompt per ImageNet class: take the readable class name, replace
# underscores with spaces and wrap it into a short caption.
descriptions = [
    f"a photo of {name}"
    for name in (entry[1].replace("_", " ") for entry in imagenet_labels.values())
]
print(descriptions[0:10])

['a photo of tench', 'a photo of goldfish', 'a photo of great white shark', 'a photo of tiger shark', 'a photo of hammerhead', 'a photo of electric ray', 'a photo of stingray', 'a photo of cock', 'a photo of hen', 'a photo of ostrich']


import clip

img, label = microImgNet[0]

model, preprocess = clip.load("ViT-B/32", device=device)


# The class prompts do not depend on the image, so they are tokenized and
# encoded once instead of on every loop iteration.
text = clip.tokenize(descriptions).to(device)
with torch.no_grad():
    text_features = model.encode_text(text)
    text_features = text_features / text_features.norm(dim=1, keepdim=True)
    # Learned temperature that CLIP applies to the cosine similarities.
    logit_scale = model.logit_scale.exp()

for i in range(6):
    img, label = microImgNet[i * 6]
    name = microImgNet.labels[i * 6][1]  # ground-truth name, kept for inspection

    image = preprocess(img).unsqueeze(0).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        image_features = image_features / image_features.norm(dim=1, keepdim=True)

        # Same computation as model(image, text): scaled cosine similarity
        # between the image and every class prompt.
        logits_per_image = logit_scale * image_features @ text_features.t()
        probs = logits_per_image.softmax(dim=-1).cpu().numpy()

    # The most probable prompt is the zero-shot prediction.
    class_num = probs.argmax()
    descr = descriptions[class_num]
    show(img, descr, i)

Базовые компоненты свёрточных сетей¶

ImageNet¶

Метрики Top 1 and Top5¶

Baseline (AlexNet 2012)¶

Тюнинг гиперпараметров (ZFnet)¶

Базовый блок (VGGNet 2014)¶

Вычислительные ресурсы¶

Дополнительная информация¶

Фильтры 3x3¶

Память для хранения параметров (VGG16)¶

Контроль GPU¶

Inception module (GoogLeNet 2014)¶

Дополнительная информация¶

1x1 Convolution¶

"Stem network"¶

Global Average Pooling¶

Дополнительная информация про GAP¶

Затухание градиента¶

Batchnorm (революция глубины)¶

Skip connection (ResNet 2015)¶

Архитектура ResNet¶

Stage ratio¶

BasicBlock в PyTorch¶

Bottleneck layer¶

Обучение ResNet¶

Обогащение карт признаков¶

Grouped Convolution (ResNeXt 2016)¶

Grouped Convolution in PyTorch¶

ResNeXt = Skip connection + Inception + Grouped convolution¶

Обзор сети MobileNet (2017 г.)¶

Depthwise separable convolution¶

Inverted residual block¶

Сравнение моделей¶

Много skip connection (DenseNet 2016)¶

Ширина вместо глубины (WideResNet 2016)¶

Squeeze-and-Excitation (SENet 2017)¶

Поиск хорошей архитектуры¶

Neural Architecture Search¶

Обзор сети EfficientNet (2019 г.)¶

Self Attention (ViT 2020)¶

Недостатки сверточного слоя¶

Self-attention¶

Сравнение со сверткой¶

Как получить веса внимания?¶

Соображения относительно размера patch¶

Position embedding¶

Архитектура ViT¶

Предсказание с помощью ViT¶

Обучение ViT¶

Объем данных и ресурсов¶

DeiT: Data-efficient Image Transformers¶

Использование ViT с собственным датасетом¶

Изменение размеров входа ViT¶

ConvNext (2022)¶

Процесс обучения¶

Ablation study¶

Model soups (2022)¶

Обучение без разметки¶

Дистилляция¶

Hard targets¶

Soft targets¶

Feature extraction¶

BYOL¶

CLIP¶

Как использовать¶

Получение embedding¶

Классификация ImageNet¶

Практические рекомендации¶