Upload README.md with huggingface_hub
README.md CHANGED
@@ -121,6 +121,28 @@ similarities = torch.einsum("btd,bd->bt", audio_embeds, text_embeds)
# similarities shape: [batch_size, num_frames]
```

### Usage with 🤗 Transformers

```python
from transformers import PeAudioFrameLevelModel, PeAudioProcessor
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
model = PeAudioFrameLevelModel.from_pretrained("facebook/pe-a-frame-large").to(device)
processor = PeAudioProcessor.from_pretrained("facebook/pe-a-frame-large")

audio_file = "path/to/audio.wav"  # placeholder: a local audio file
descriptions = ["a dog barking"]  # one description per audio clip (paired by the einsum below)

inputs = processor(audio=[audio_file], text=descriptions, return_tensors="pt").to(device)

with torch.inference_mode():
    outputs = model(**inputs)

# Access embeddings
audio_embeds = outputs.audio_embeds  # Shape: [batch_size, num_frames, embed_dim]
text_embeds = outputs.text_audio_embeds  # Shape: [batch_size, embed_dim]

# Compute similarity between audio frames and text
# audio_embeds is frame-level, so you can see which frames match the description
similarities = torch.einsum("btd,bd->bt", audio_embeds, text_embeds)
# similarities shape: [batch_size, num_frames]
```
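Because the scores are per frame, a natural follow-up is to localize where in the clip a description matches best. The sketch below is a minimal illustration building on the `similarities` and `descriptions` variables from the snippet above; the frame hop duration is an assumed placeholder, not a documented property of this model.

```python
# Minimal sketch: localize the best-matching frame for each description.
# Builds on `similarities` ([batch_size, num_frames]) and `descriptions` above.
best_frames = similarities.argmax(dim=-1)  # index of the highest-scoring frame

# Convert frame indices to rough timestamps. 0.04 s per frame is an assumed
# placeholder; substitute the model's actual frame hop.
frame_hop_seconds = 0.04
best_times = best_frames.float() * frame_hop_seconds

for desc, t in zip(descriptions, best_times.tolist()):
    print(f"{desc!r} matches best around {t:.2f}s")
```

Note that the einsum above produces raw dot products; if the embeddings are not already unit-normalized, applying `torch.nn.functional.normalize` to both tensors first yields cosine similarities instead.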
## Citation
```bibtex