Instructions to use mispeech/ced-tiny with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use mispeech/ced-tiny with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("audio-classification", model="mispeech/ced-tiny", trust_remote_code=True)
```

```python
# Load model directly
from transformers import AutoModelForAudioClassification

model = AutoModelForAudioClassification.from_pretrained("mispeech/ced-tiny", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| # coding=utf-8 | |
| # Copyright 2023 Xiaomi Corporation and The HuggingFace Inc. team. All rights reserved. | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """ CED model configuration""" | |
| import csv | |
| from transformers import PretrainedConfig | |
| from transformers.utils import logging | |
| from transformers.utils.hub import cached_file | |
# Module-level logger obtained through the transformers logging utilities.
logger = logging.get_logger(__name__)

# Maps each published CED checkpoint identifier to the URL of its hosted
# ``config.json``.
CED_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "mispeech/ced-tiny": "https://huggingface.co/mispeech/ced-tiny/resolve/main/config.json",
}
class CedConfig(PretrainedConfig):
    r"""
    Configuration class for the CED model.

    Args:
        name (str, *optional*):
            Name of the pre-defined configuration. Can be "ced-tiny", "ced-mini", "ced-small" or "ced-base".
            When one of these names is given, `embed_dim` and `num_heads` are overridden by the preset values,
            even if they were passed explicitly.
        attn_drop_rate (float, *optional*, defaults to 0.0):
            Dropout probability for attention weights.
        depth (int, *optional*, defaults to 12): Number of transformer layers.
        drop_path_rate (float, *optional*, defaults to 0.0): Drop path is taken from timm.
        drop_rate (float, *optional*, defaults to 0.0):
            Dropout probability for input embeddings.
        embed_dim (int, *optional*, defaults to 768):
            Dimensionality of the audio patch embeddings.
        eval_avg (str, *optional*, defaults to `"mean"`):
            Type of pooling to use for evaluation. Can be "mean", "token", "dm" or "logit".
        mlp_ratio (float, *optional*, defaults to 4.0):
            Ratio of hidden size in the feedforward layer to the embedding size.
        num_heads (int, *optional*, defaults to 12): Number of attention heads.
        outputdim (int, *optional*, defaults to 527): Dimensionality of the output.
        patch_size (int, *optional*, defaults to 16): Size of the patches.
        patch_stride (int, *optional*, defaults to 16): Stride of the patches.
        pooling (str, *optional*, defaults to `"mean"`):
            Type of pooling to use for the output. Can be "mean", "token", "dm" or "logit".
        qkv_bias (bool, *optional*, defaults to `True`):
            Whether to include bias terms in the query, key and value projections.
        target_length (int, *optional*, defaults to 1012): Frames of an audio chunk.
        kwargs:
            Forwarded to `PretrainedConfig.__init__`. The following keys are additionally read and stored as
            attributes, falling back to the default in parentheses when absent: `center` (True), `f_max` (8000),
            `f_min` (0), `hop_size` (160), `n_fft` (512), `n_mels` (64), `pad_last` (True), `win_size` (512).
            NOTE(review): these look like spectrogram front-end settings -- confirm against the feature extractor.

    Raises:
        ValueError: If `pooling` is not one of "mean", "token", "dm" or "logit".
    """

    model_type = "ced"

    def __init__(
        self,
        name=None,
        attn_drop_rate=0.0,
        depth=12,
        drop_path_rate=0.0,
        drop_rate=0.0,
        embed_dim=768,
        eval_avg="mean",
        mlp_ratio=4.0,
        num_heads=12,
        outputdim=527,
        patch_size=16,
        patch_stride=16,
        pooling="mean",
        qkv_bias=True,
        target_length=1012,
        **kwargs,
    ):
        r"""
        Store the model hyper-parameters and, for the 527-class setup, load the class label mappings.

        A recognised `name` replaces `embed_dim` and `num_heads` with the preset values. The `loss` attribute is
        always set to "BCE". When `outputdim` is 527, `id2label` / `label2id` are built from the
        `class_labels_indices.csv` file of the `topel/ConvNeXt-Tiny-AT` repository (resolved via `cached_file`);
        otherwise both are set to `None`.
        """
        super().__init__(**kwargs)

        # Named presets take precedence over explicitly passed embed_dim / num_heads.
        if name == "ced-tiny":
            embed_dim = 192
            num_heads = 3
        elif name == "ced-mini":
            embed_dim = 256
            num_heads = 4
        elif name == "ced-small":
            embed_dim = 384
            num_heads = 6
        elif name == "ced-base":
            embed_dim = 768
            num_heads = 12
        else:
            logger.info("No model name specified for CedConfig, use default settings.")

        # Explicit check instead of `assert`, which is stripped when Python runs with -O.
        valid_poolings = ("mean", "token", "dm", "logit")
        if pooling not in valid_poolings:
            raise ValueError(f"pooling must be one of {valid_poolings}, got {pooling!r}")

        self.name = name
        self.attn_drop_rate = attn_drop_rate
        self.center = kwargs.get("center", True)
        self.depth = depth
        self.drop_path_rate = drop_path_rate
        self.drop_rate = drop_rate
        self.embed_dim = embed_dim
        self.eval_avg = eval_avg
        self.f_max = kwargs.get("f_max", 8000)
        self.f_min = kwargs.get("f_min", 0)
        self.hop_size = kwargs.get("hop_size", 160)
        self.mlp_ratio = mlp_ratio
        self.n_fft = kwargs.get("n_fft", 512)
        self.n_mels = kwargs.get("n_mels", 64)
        self.num_heads = num_heads
        self.outputdim = outputdim
        self.pad_last = kwargs.get("pad_last", True)
        self.patch_size = patch_size
        self.patch_stride = patch_stride
        self.pooling = pooling
        self.qkv_bias = qkv_bias
        self.target_length = target_length
        self.win_size = kwargs.get("win_size", 512)
        self.loss = "BCE"

        if self.outputdim == 527:
            labels_path = cached_file("topel/ConvNeXt-Tiny-AT", "class_labels_indices.csv")
            id2label = {}
            seen_labels = set()  # O(1) duplicate detection instead of scanning dict values per row
            # newline="" is what the csv module expects; utf-8 avoids locale-dependent decoding.
            with open(labels_path, "r", encoding="utf-8", newline="") as f:
                reader = csv.reader(f)
                next(reader)  # skip header
                for row in reader:
                    idx = int(row[0])
                    label = row[2]
                    # Keep only the first index seen for any given label text.
                    if label in seen_labels:
                        continue
                    seen_labels.add(label)
                    id2label[idx] = label
            self.id2label = id2label
            self.label2id = {label: idx for idx, label in id2label.items()}
        else:
            self.id2label = None
            self.label2id = None