import tensorflow as tf
from tensorflow import keras
from keras_cv_attention_models.common_layers import (
layer_norm, activation_by_name
)
from tensorflow.keras import initializers
from keras_cv_attention_models.attention_layers import (
conv2d_no_bias,
drop_block,
)
import math
# Normalization hyper-parameters shared across this file.
# NOTE(review): BATCH_NORM_DECAY / BATCH_NORM_EPSILON / TF_BATCH_NORM_EPSILON are
# not referenced anywhere in this chunk — presumably kept for parity with sibling
# model files in the package; confirm before removing.
BATCH_NORM_DECAY = 0.9
BATCH_NORM_EPSILON = 1e-5
TF_BATCH_NORM_EPSILON = 0.001
# Epsilon used by the LayerNormalization layers in SDTA_encoder below.
LAYER_NORM_EPSILON = 1e-5
@tf.keras.utils.register_keras_serializable(package="EdgeNeXt")
class PositionalEncodingFourier(keras.layers.Layer):
    """Fourier (sine/cosine) 2D positional encoding projected to `dim` channels.

    DETR-style positional embedding: per-axis cumulative positions are
    normalized to (0, 1], scaled by 2*pi, expanded into interleaved sin/cos
    features of size `hidden_dim` per axis, concatenated (y first, then x),
    and mapped to `dim` channels with a 1x1 convolution.
    """

    def __init__(self, hidden_dim=32, dim=768, temperature=10000, **kwargs):
        # Forward **kwargs (name, dtype, ...) to the Layer base class so Keras
        # can reconstruct this layer from its config.
        super(PositionalEncodingFourier, self).__init__(**kwargs)
        # 1x1 conv projecting the 2 * hidden_dim sin/cos features to `dim`.
        self.token_projection = tf.keras.layers.Conv2D(dim, kernel_size=1)
        self.scale = 2 * math.pi
        self.temperature = temperature
        self.hidden_dim = hidden_dim
        self.dim = dim
        self.eps = 1e-6  # guards the division when normalizing positions

    def __call__(self, B, H, W, *args, **kwargs):
        # NOTE: takes static integer sizes (not a tensor), so plain __call__ is
        # overridden deliberately instead of implementing Layer.call.
        mask_tf = tf.zeros([B, H, W])
        not_mask_tf = 1 - mask_tf
        # Cumulative sums turn the all-ones mask into 1..H / 1..W indices.
        y_embed_tf = tf.cumsum(not_mask_tf, axis=1)
        x_embed_tf = tf.cumsum(not_mask_tf, axis=2)
        # Normalize by the last index and scale to (0, 2*pi].
        y_embed_tf = y_embed_tf / (y_embed_tf[:, -1:, :] + self.eps) * self.scale
        x_embed_tf = x_embed_tf / (x_embed_tf[:, :, -1:] + self.eps) * self.scale
        dim_t_tf = tf.range(self.hidden_dim, dtype=tf.float32)
        # Geometric frequency schedule; pairs of indices share a frequency.
        dim_t_tf = self.temperature ** (2 * (dim_t_tf // 2) / self.hidden_dim)
        pos_x_tf = x_embed_tf[:, :, :, None] / dim_t_tf
        pos_y_tf = y_embed_tf[:, :, :, None] / dim_t_tf
        # Interleave sin (even channels) and cos (odd channels), then flatten.
        pos_x_tf = tf.reshape(
            tf.stack([tf.math.sin(pos_x_tf[:, :, :, 0::2]),
                      tf.math.cos(pos_x_tf[:, :, :, 1::2])], axis=4),
            shape=[B, H, W, self.hidden_dim])
        pos_y_tf = tf.reshape(
            tf.stack([tf.math.sin(pos_y_tf[:, :, :, 0::2]),
                      tf.math.cos(pos_y_tf[:, :, :, 1::2])], axis=4),
            shape=[B, H, W, self.hidden_dim])
        pos_tf = tf.concat([pos_y_tf, pos_x_tf], axis=-1)
        pos_tf = self.token_projection(pos_tf)
        return pos_tf

    def get_config(self):
        # BUG FIX: only plain-Python values belong in the config — the previous
        # version serialized the Conv2D sublayer (and derived scale/eps), which
        # breaks from_config round-tripping. Sublayers are rebuilt in __init__.
        base_config = super().get_config()
        base_config.update({"hidden_dim": self.hidden_dim, "dim": self.dim,
                            "temperature": self.temperature})
        return base_config
def EdgeNeXt(input_shape=(256, 256, 3), depths=[3, 3, 9, 3], dims=[24, 48, 88, 168],
             global_block=[0, 0, 0, 3], global_block_type=['None', 'None', 'None', 'SDTA'],
             drop_path_rate=1, layer_scale_init_value=1e-6, head_init_scale=1., expan_ratio=4,
             kernel_sizes=[7, 7, 7, 7], heads=[8, 8, 8, 8], use_pos_embd_xca=[False, False, False, False],
             use_pos_embd_global=False, d2_scales=[2, 3, 4, 5], epsilon=1e-6, model_name='EdgeNeXt'):
    """Build the EdgeNeXt backbone as a functional Keras model.

    Four stages; stage ``i`` has ``depths[i]`` blocks of width ``dims[i]``. The
    last ``global_block[i]`` blocks of a stage use the global encoder named in
    ``global_block_type[i]`` (only 'SDTA' is implemented; anything else raises
    NotImplementedError); the remaining blocks are local Conv_Encoder blocks.
    Stages 1-3 are preceded by LayerNorm + stride-2 conv downsampling; the stem
    is a stride-4 conv + LayerNorm.

    NOTE(review): ``head_init_scale`` and ``use_pos_embd_global`` are accepted
    for parity with the reference implementation but are currently unused, and
    no classification head is attached. The default ``drop_path_rate=1`` looks
    suspect (the reference uses 0.0) — kept for backward compatibility.
    """
    # Batch size is fixed because PositionalEncodingFourier builds tensors from
    # static (non-None) batch/height/width sizes.
    inputs = keras.layers.Input(input_shape, batch_size=2)
    nn = conv2d_no_bias(inputs, dims[0], kernel_size=4, strides=4, padding="valid", name="stem_")
    nn = layer_norm(nn, epsilon=epsilon, name='stem_')
    # Per-block stochastic-depth rates, linearly spaced from 0 to drop_path_rate.
    # Endpoints cast to float so tf.linspace never sees integer dtypes.
    drop_connect_rates = tf.linspace(0.0, stop=float(drop_path_rate), num=int(sum(depths)))
    cur = 0
    for i in range(4):
        for j in range(depths[i]):
            if j > depths[i] - global_block[i] - 1:
                if global_block_type[i] == 'SDTA':
                    # BUG FIX: keep the encoder output — it was previously discarded,
                    # so the SDTA blocks never took part in the graph.
                    nn = SDTA_encoder(dim=dims[i], drop_path=drop_connect_rates[cur + j],
                                      expan_ratio=expan_ratio, scales=d2_scales[i],
                                      use_pos_emb=use_pos_embd_xca[i], num_heads=heads[i],
                                      name='stage_' + str(i) + '_SDTA_encoder_' + str(j))(nn)
                else:
                    raise NotImplementedError
            else:
                if i != 0 and j == 0:
                    # Between-stage downsample: LayerNorm then stride-2 conv.
                    nn = layer_norm(nn, epsilon=epsilon, name='stage_' + str(i) + '_')
                    nn = conv2d_no_bias(nn, dims[i], kernel_size=2, strides=2, padding="valid",
                                        name='stage_' + str(i) + '_')
                # BUG FIX: keep the encoder output (previously discarded).
                nn = Conv_Encoder(dim=dims[i], drop_path=drop_connect_rates[cur + j],
                                  layer_scale_init_value=layer_scale_init_value,
                                  expan_ratio=expan_ratio, kernel_size=kernel_sizes[i],
                                  name='stage_' + str(i) + '_Conv_Encoder_' + str(j) + '_')(nn)
        # BUG FIX: advance the drop-path index past this stage's blocks; `cur`
        # was never updated, so every stage reused the first stage's rates.
        cur += depths[i]
    model = keras.models.Model(inputs, nn, name=model_name)
    return model
@tf.keras.utils.register_keras_serializable(package="EdgeNeXt")
class Conv_Encoder(keras.layers.Layer):
    """ConvNeXt-style local encoder block.

    kxk conv -> LayerNorm -> pointwise expand (Dense, ratio `expan_ratio`) ->
    GELU -> pointwise project back to `dim`, with optional per-channel layer
    scale (`gamma`) and a stochastic-depth residual connection.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6, expan_ratio=4, kernel_size=7, epsilon=1e-6,
                 name=''):
        super(Conv_Encoder, self).__init__()
        # `name` is kept as a plain prefix for inner layer names (the Layer base
        # name is left to Keras auto-naming, matching the original behavior).
        self.encoder_name = name
        self.layer_scale_init_value = layer_scale_init_value
        # Learnable per-channel scale on the residual branch (layer scale);
        # disabled when layer_scale_init_value <= 0.
        self.gamma = tf.Variable(layer_scale_init_value * tf.ones(dim), trainable=True,
                                 name=name + 'gamma') if layer_scale_init_value > 0 else None
        self.drop_path = drop_path
        self.dim = dim
        self.expan_ratio = expan_ratio
        self.kernel_size = kernel_size
        self.epsilon = epsilon

    def __call__(self, x, *args, **kwargs):
        inputs = x
        x = keras.layers.Conv2D(self.dim, kernel_size=self.kernel_size, padding="SAME",
                                name=self.encoder_name + 'Conv2D')(x)
        x = layer_norm(x, epsilon=self.epsilon, name=self.encoder_name)
        x = keras.layers.Dense(self.expan_ratio * self.dim)(x)  # pointwise expand
        x = activation_by_name(x, activation="gelu")
        x = keras.layers.Dense(self.dim)(x)  # pointwise project back to dim
        if self.gamma is not None:
            x = self.gamma * x
        # BUG FIX: apply the configured stochastic-depth rate; previously the
        # rate was hard-coded to 0., silently disabling drop-path everywhere.
        x = inputs + drop_block(x, drop_rate=self.drop_path)
        return x

    def get_config(self):
        # BUG FIX: serialize only plain values that __init__ accepts; `gamma`
        # is a tf.Variable (rebuilt in __init__) and must not go in the config.
        base_config = super().get_config()
        base_config.update({"drop_path": self.drop_path, "dim": self.dim,
                            "expan_ratio": self.expan_ratio,
                            "kernel_size": self.kernel_size,
                            "layer_scale_init_value": self.layer_scale_init_value,
                            "epsilon": self.epsilon})
        return base_config
@tf.keras.utils.register_keras_serializable(package="EdgeNeXt")
class SDTA_encoder(keras.layers.Layer):
    """Split Depth-wise Transpose Attention (SDTA) encoder block.

    Three sub-stages, each residual:
      1. Res2Net-style multi-scale convs: channels are split into `scales`
         groups; each group (except the last) is 3x3-convolved after adding the
         previous group's output.
      2. Cross-covariance attention (XCA) over flattened spatial positions,
         optionally with a Fourier positional encoding.
      3. Inverted-bottleneck MLP (1x1 expand -> activation -> 1x1 project).
    Branches 2 and 3 are scaled by learnable layer-scale vectors.
    """

    def __init__(self, dim, drop_path=0., layer_scale_init_value=1e-6, expan_ratio=4,
                 use_pos_emb=True, num_heads=8, qkv_bias=True, attn_drop=0., drop=0., scales=1, zero_gamma=False,
                 activation='gelu', use_bias=False, name='sdf'):
        super(SDTA_encoder, self).__init__()
        self.expan_ratio = expan_ratio
        # Width of each of the first (scales - 1) channel groups; the last group
        # takes the remainder so the widths always sum to `dim`.
        self.width = max(int(math.ceil(dim / scales)), int(math.floor(dim // scales)))
        self.width_list = [self.width] * (scales - 1)
        self.width_list.append(dim - self.width * (scales - 1))
        self.dim = dim
        self.scales = scales
        self.num_heads = num_heads
        self.use_pos_emb = use_pos_emb
        self.layer_scale_init_value = layer_scale_init_value
        # Number of convolved groups. NOTE(review): __call__ indexes
        # spx[self.nums], which requires scales >= 2; scales == 1 would raise.
        self.nums = 1 if scales == 1 else scales - 1
        self.pos_embd = PositionalEncodingFourier(dim=dim) if use_pos_emb else None
        self.xca = XCA(dim, num_heads=num_heads, qkv_bias=qkv_bias, attn_drop=attn_drop, proj_drop=drop)
        # Layer-scale vectors for the attention branch and the MLP branch.
        # BUG FIX: the two variables previously requested the identical name.
        self.gamma_xca = tf.Variable(layer_scale_init_value * tf.ones(dim), trainable=True,
                                     name=name + 'gamma_xca') if layer_scale_init_value > 0 else None
        self.gamma = tf.Variable(layer_scale_init_value * tf.ones(dim), trainable=True,
                                 name=name + 'gamma') if layer_scale_init_value > 0 else None
        self.drop_rate = drop_path
        self.drop_path = keras.layers.Dropout(drop_path)
        gamma_initializer = tf.zeros_initializer() if zero_gamma else tf.ones_initializer()
        self.norm = keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPSILON, gamma_initializer=gamma_initializer,
                                                    name=name and name + "ln")
        self.norm_xca = keras.layers.LayerNormalization(epsilon=LAYER_NORM_EPSILON, gamma_initializer=gamma_initializer,
                                                        name=name and name + "norm_xca")
        self.activation = activation
        self.use_bias = use_bias

    def get_config(self):
        # BUG FIX: serialize only plain values matching __init__ parameters; the
        # previous config contained sublayers/variables and keys __init__ does
        # not accept, which breaks from_config round-tripping.
        base_config = super().get_config()
        base_config.update({"dim": self.dim, "drop_path": self.drop_rate,
                            "layer_scale_init_value": self.layer_scale_init_value,
                            "expan_ratio": self.expan_ratio,
                            "use_pos_emb": self.use_pos_emb,
                            "num_heads": self.num_heads, "scales": self.scales,
                            "activation": self.activation, "use_bias": self.use_bias})
        return base_config

    def __call__(self, inputs, *args, **kwargs):
        x = inputs
        # --- 1. Multi-scale split convolutions (Res2Net-style cascade) ---
        spx = tf.split(inputs, self.width_list, axis=-1)
        for i in range(self.nums):
            sp = spx[i] if i == 0 else sp + spx[i]  # cascade previous output in
            sp = keras.layers.Conv2D(self.width, kernel_size=3, padding='SAME')(sp)  # , groups=self.width
            out = sp if i == 0 else tf.concat([out, sp], -1)
        inputs = tf.concat([out, spx[self.nums]], -1)  # last group passes through
        # --- 2. XCA over flattened spatial positions ---
        B, H, W, C = inputs.shape
        inputs = tf.reshape(inputs, (-1, H * W, C))
        if self.pos_embd:
            pos_encoding = tf.reshape(self.pos_embd(B, H, W), (-1, H * W, C))
            inputs += pos_encoding
        # BUG FIX: gamma_xca scales only the attention branch. Previously the
        # residual trunk itself was also multiplied by gamma_xca (initialized
        # ~1e-6), which collapsed the activations.
        input_xca = self.xca(self.norm_xca(inputs))
        if self.gamma_xca is not None:
            input_xca = self.gamma_xca * input_xca
        inputs = inputs + drop_block(input_xca, drop_rate=self.drop_rate, name="SDTA_encoder_")
        inputs = tf.reshape(inputs, (-1, H, W, C))
        # --- 3. Inverted bottleneck MLP ---
        inputs = self.norm(inputs)
        inputs = keras.layers.Conv2D(self.expan_ratio * self.dim, kernel_size=1, use_bias=self.use_bias)(inputs)
        inputs = activation_by_name(inputs, activation=self.activation)
        inputs = keras.layers.Conv2D(self.dim, kernel_size=1, use_bias=self.use_bias)(inputs)
        if self.gamma is not None:
            inputs = self.gamma * inputs
        x = x + self.drop_path(inputs)
        return x
@tf.keras.utils.register_keras_serializable(package="EdgeNeXt")
class XCA(keras.layers.Layer):
    """Cross-Covariance Attention (XCA).

    Attention is computed between channel dimensions rather than spatial
    positions: q and k are L2-normalized along the spatial axis, so the
    attention map is a per-head (dims_per_head x dims_per_head) cosine
    similarity, scaled by a learnable per-head temperature.
    """

    def __init__(self, dim, num_heads=8, qkv_bias=False, attn_drop=0., proj_drop=0., name=""):
        super(XCA, self).__init__()
        self.dim = dim
        self.num_heads = num_heads
        # BUG FIX: tf.ones(num_heads, 1, 1) passed 1 positionally as dtype and
        # name; the per-head temperature must be shaped (num_heads, 1, 1) so it
        # broadcasts over the [batch, heads, d, d] attention map directly.
        self.temperature = tf.Variable(tf.ones((num_heads, 1, 1)), trainable=True, name=name + 'gamma')
        self.qkv = keras.layers.Dense(dim * 3, use_bias=qkv_bias)
        self.attn_drop = keras.layers.Dropout(attn_drop)
        self.k_ini = initializers.GlorotUniform()
        self.b_ini = initializers.Zeros()
        self.proj = keras.layers.Dense(dim, name="out",
                                       kernel_initializer=self.k_ini, bias_initializer=self.b_ini)
        self.proj_drop = keras.layers.Dropout(proj_drop)

    def __call__(self, inputs, training=None, *args, **kwargs):
        B, N, C = inputs.shape  # [batch, hh * ww, channels]
        qkv = self.qkv(inputs)
        qkv = tf.reshape(qkv, (-1, N, 3, self.num_heads, C // self.num_heads))
        qkv = tf.transpose(qkv, perm=[2, 0, 3, 4, 1])  # [3, batch, heads, dims_per_head, hh * ww]
        # BUG FIX: index instead of tf.split + tf.squeeze — squeeze removed
        # every size-1 axis, destroying the batch axis when batch == 1, and
        # left a stale leading 1-axis on `value`.
        query, key, value = qkv[0], qkv[1], qkv[2]
        # Normalize along the spatial axis so attn is a channel-to-channel map.
        norm_query = tf.nn.l2_normalize(query, axis=-1, epsilon=1e-6)
        norm_key = tf.nn.l2_normalize(key, axis=-1, epsilon=1e-6)
        attn = tf.matmul(norm_query, norm_key, transpose_b=True) * self.temperature  # [batch, heads, d, d]
        attn = tf.nn.softmax(attn, axis=-1)
        attn = self.attn_drop(attn, training=training)
        x = tf.matmul(attn, value)  # [batch, heads, dims_per_head, hh * ww]
        # BUG FIX: restore [batch, hh * ww, heads, dims_per_head] ordering before
        # merging heads; the previous direct reshape interleaved spatial and
        # channel axes.
        x = tf.transpose(x, perm=[0, 3, 1, 2])
        x = tf.reshape(x, (-1, N, C))
        x = self.proj(x)
        x = self.proj_drop(x, training=training)
        return x

    def get_config(self):
        # Serialize only plain values; sublayers and the temperature variable
        # are rebuilt in __init__.
        base_config = super().get_config()
        base_config.update({"dim": self.dim, "num_heads": self.num_heads})
        return base_config
def edgenext_xx_small(pretrained=False, **kwargs):
    """EdgeNeXt-XXS configuration.

    1.33M params & 260.58M MAdds @ 256 resolution; 71.23% top-1 accuracy.
    Training recipe: no AA, color jitter=0.4, no Mixup/CutMix, DropPath=0.0,
    BS=4096, lr=0.006, multi-scale sampler.
    Throughput: Jetson FPS=51.66 (vs 47.67 for MobileViT_XXS); A100 FPS
    @ BS=1: 212.13 and @ BS=256: 7042.06 (vs 96.68 / 4624.71 for MobileViT_XXS).

    NOTE(review): `pretrained` is accepted for API parity but not used here.
    """
    return EdgeNeXt(
        depths=[2, 2, 6, 2],
        dims=[24, 48, 88, 168],
        expan_ratio=4,
        global_block=[0, 1, 1, 1],
        global_block_type=['None', 'SDTA', 'SDTA', 'SDTA'],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        heads=[4, 4, 4, 4],
        d2_scales=[2, 2, 3, 4],
        **kwargs,
    )
def edgenext_x_small(pretrained=False, **kwargs):
    """EdgeNeXt-XS configuration.

    2.34M params & 538.0M MAdds @ 256 resolution; 75.00% top-1 accuracy.
    Training recipe: no AA, no Mixup/CutMix, DropPath=0.0, BS=4096, lr=0.006,
    multi-scale sampler.
    Throughput: Jetson FPS=31.61 (vs 28.49 for MobileViT_XS); A100 FPS
    @ BS=1: 179.55 and @ BS=256: 4404.95 (vs 94.55 / 2361.53 for MobileViT_XS).

    NOTE(review): `pretrained` is accepted for API parity but not used here.
    """
    return EdgeNeXt(
        depths=[3, 3, 9, 3],
        dims=[32, 64, 100, 192],
        expan_ratio=4,
        global_block=[0, 1, 1, 1],
        global_block_type=['None', 'SDTA', 'SDTA', 'SDTA'],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        heads=[4, 4, 4, 4],
        d2_scales=[2, 2, 3, 4],
        **kwargs,
    )
def edgenext_small(pretrained=False, **kwargs):
    """EdgeNeXt-S configuration.

    5.59M params & 1260.59M MAdds @ 256 resolution; 79.43% top-1 accuracy.
    Training recipe: AA=True, no Mixup/CutMix, DropPath=0.1, BS=4096, lr=0.006,
    multi-scale sampler.
    Throughput: Jetson FPS=20.47 (vs 18.86 for MobileViT_S); A100 FPS
    @ BS=1: 172.33 and @ BS=256: 3010.25 (vs 93.84 / 1785.92 for MobileViT_S).

    Uses the default head count (heads=[8, 8, 8, 8]), unlike the XXS/XS
    variants. NOTE(review): `pretrained` is accepted for API parity but unused.
    """
    return EdgeNeXt(
        depths=[3, 3, 9, 3],
        dims=[48, 96, 160, 304],
        expan_ratio=4,
        global_block=[0, 1, 1, 1],
        global_block_type=['None', 'SDTA', 'SDTA', 'SDTA'],
        use_pos_embd_xca=[False, True, False, False],
        kernel_sizes=[3, 5, 7, 9],
        d2_scales=[2, 2, 3, 4],
        **kwargs,
    )
if __name__ == '__main__':
    # Smoke test: build the small variant and print a layer-by-layer summary.
    model = edgenext_small()
    model.summary()
    # Commented-out helper for porting the official PyTorch weights into this
    # Keras model — kept for reference. NOTE(review): the save_name
    # "adaface_ir101_webface4m.h5" looks copy-pasted from another project;
    # confirm before reusing.
    # from download_and_load import keras_reload_from_torch_model
    # keras_reload_from_torch_model(
    # 'D:\GitHub\EdgeNeXt\edgenext_small.pth',
    # keras_model=model,
    # # tail_align_dict=tail_align_dict,
    # # full_name_align_dict=full_name_align_dict,
    # # additional_transfer=additional_transfer,
    # input_shape=(256, 256),
    # do_convert=True,
    # save_name="adaface_ir101_webface4m.h5",
    # )