Upload 2 files
Browse files
Notebooks/GSI_VideoRetrieval_EmbedVideos.ipynb
ADDED
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"attachments": {},
|
5 |
+
"cell_type": "markdown",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"# GSI Technology Video Search Demo - Embedding Videos Notebook\n",
|
9 |
+
"\n",
|
10 |
+
"This notebook contains code that demonstrates the video-embedding process.<br>\n",
|
11 |
+
"It specifically focuses on embedding a single video using the [Diangle/clip4clip-webvid](https://huggingface.co/Diangle/clip4clip-webvid) model."
|
12 |
+
]
|
13 |
+
},
|
14 |
+
{
|
15 |
+
"cell_type": "code",
|
16 |
+
"execution_count": 1,
|
17 |
+
"metadata": {},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"\"Close-up women's hands scratch\"\n",
|
21 |
+
"example = './example/34721191.mp4'"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "code",
|
26 |
+
"execution_count": 2,
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [],
|
29 |
+
"source": [
|
30 |
+
"from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize, InterpolationMode\n",
|
31 |
+
"from PIL import Image\n",
|
32 |
+
"import cv2\n",
|
33 |
+
"import numpy as np\n",
|
34 |
+
"import torch\n",
|
35 |
+
"\n",
|
36 |
+
"# Convert one video into a small set of preprocessed frame images.\n",
|
37 |
+
"def video2image(video_path, frame_rate=1.0, size=224):\n",
|
38 |
+
" def preprocess(size, n_px):\n",
|
39 |
+
" return Compose([\n",
|
40 |
+
" Resize(size, interpolation=InterpolationMode.BICUBIC), \n",
|
41 |
+
" CenterCrop(size),\n",
|
42 |
+
" lambda image: image.convert(\"RGB\"),\n",
|
43 |
+
" ToTensor(),\n",
|
44 |
+
" Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),\n",
|
45 |
+
" ])(n_px)\n",
|
46 |
+
" \n",
|
47 |
+
" cap = cv2.VideoCapture(video_path)\n",
|
48 |
+
" cap = cv2.VideoCapture(video_path, cv2.CAP_FFMPEG)\n",
|
49 |
+
" frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))\n",
|
50 |
+
" fps = int(cap.get(cv2.CAP_PROP_FPS))\n",
|
51 |
+
" if fps < 1:\n",
|
52 |
+
" images = np.zeros([3, size, size], dtype=np.float32) \n",
|
53 |
+
" print(\"ERROR: problem reading video file: \", video_path)\n",
|
54 |
+
" else:\n",
|
55 |
+
" total_duration = (frameCount + fps - 1) // fps\n",
|
56 |
+
" start_sec, end_sec = 0, total_duration\n",
|
57 |
+
" interval = fps / frame_rate\n",
|
58 |
+
" frames_idx = np.floor(np.arange(start_sec*fps, end_sec*fps, interval))\n",
|
59 |
+
" ret = True \n",
|
60 |
+
" images = np.zeros([len(frames_idx), 3, size, size], dtype=np.float32)\n",
|
61 |
+
" \n",
|
62 |
+
" for i, idx in enumerate(frames_idx):\n",
|
63 |
+
" cap.set(cv2.CAP_PROP_POS_FRAMES , idx)\n",
|
64 |
+
" ret, frame = cap.read() \n",
|
65 |
+
" if not ret: break\n",
|
66 |
+
" frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) \n",
|
67 |
+
" last_frame = i\n",
|
68 |
+
" images[i,:,:,:] = preprocess(size, Image.fromarray(frame).convert(\"RGB\"))\n",
|
69 |
+
" \n",
|
70 |
+
" images = images[:last_frame+1]\n",
|
71 |
+
" cap.release()\n",
|
72 |
+
" video_frames = torch.tensor(images)\n",
|
73 |
+
" return video_frames\n",
|
74 |
+
" \n",
|
75 |
+
"# Decode the example clip into a float32 tensor of preprocessed frames.\n",
"video = video2image(example)"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 3,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [
|
83 |
+
{
|
84 |
+
"name": "stderr",
|
85 |
+
"output_type": "stream",
|
86 |
+
"text": [
|
87 |
+
"Some weights of the model checkpoint at Diangle/clip4clip-webvid were not used when initializing CLIPVisionModelWithProjection: ['text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'logit_scale', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 
'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 
'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 
'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_projection.weight', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.4.self_attn.q_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.6.mlp.fc1.weight', 
'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.7.self_attn.k_proj.bias']\n",
|
88 |
+
"- This IS expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
|
89 |
+
"- This IS NOT expected if you are initializing CLIPVisionModelWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
|
90 |
+
]
|
91 |
+
},
|
92 |
+
{
|
93 |
+
"name": "stdout",
|
94 |
+
"output_type": "stream",
|
95 |
+
"text": [
|
96 |
+
"tensor([-2.9570e-02, 6.0339e-03, 1.7294e-02, -1.3951e-02, 4.8329e-02,\n",
|
97 |
+
" 2.4099e-02, 3.3340e-02, 3.1769e-02, 2.1997e-03, 4.2602e-03,\n",
|
98 |
+
" -1.3887e-02, 8.2744e-03, 2.5123e-03, -2.2163e-02, -4.1139e-02,\n",
|
99 |
+
" -1.2101e-02, -6.1914e-02, 6.7091e-03, 4.2834e-02, -2.2604e-02,\n",
|
100 |
+
" -2.7443e-02, 1.0600e-02, 2.9430e-03, 3.2580e-02, -1.3577e-02,\n",
|
101 |
+
" 7.8084e-03, 1.2397e-02, -5.3404e-03, 1.4736e-02, -2.4564e-02,\n",
|
102 |
+
" -5.4057e-02, 3.9507e-02, 1.2754e-02, 4.6864e-04, 7.4087e-03,\n",
|
103 |
+
" 3.8710e-03, 7.9482e-03, 1.3444e-02, -1.7326e-02, -1.2486e-01,\n",
|
104 |
+
" -8.4992e-02, -3.9097e-02, -2.1903e-02, -7.1480e-03, -2.7220e-03,\n",
|
105 |
+
" 4.1397e-03, 1.7315e-02, 4.4724e-02, 9.1722e-04, 3.1429e-02,\n",
|
106 |
+
" 3.8212e-02, -2.1133e-02, 2.4437e-03, -1.4371e-03, -2.9859e-03,\n",
|
107 |
+
" 7.8939e-04, 2.4093e-02, -2.2199e-02, -3.9110e-02, 1.7673e-02,\n",
|
108 |
+
" 1.1360e-01, 3.3466e-03, -1.9643e-02, 1.7798e-03, 1.5112e-02,\n",
|
109 |
+
" -6.2003e-03, -2.0564e-02, 6.4936e-02, 6.6286e-02, -2.0585e-02,\n",
|
110 |
+
" 2.0740e-02, 1.0476e-02, -5.9948e-03, -2.4672e-02, 2.3725e-02,\n",
|
111 |
+
" -4.6442e-03, 1.8887e-02, 3.7517e-02, 3.1605e-02, -3.7756e-03,\n",
|
112 |
+
" 2.7584e-02, 5.7234e-03, 3.4368e-02, 1.4564e-02, 2.6392e-02,\n",
|
113 |
+
" -1.9975e-02, 1.2648e-01, -5.3093e-03, 7.3013e-02, 4.8827e-03,\n",
|
114 |
+
" -2.8492e-02, -4.9734e-02, -6.6967e-01, 1.2463e-02, 2.4013e-02,\n",
|
115 |
+
" 1.3702e-02, 2.9382e-02, 1.4373e-02, -2.1994e-02, 3.6824e-03,\n",
|
116 |
+
" 2.9366e-02, -2.1474e-03, 1.7371e-02, -6.1958e-02, -4.6649e-02,\n",
|
117 |
+
" -4.3063e-03, 1.0081e-01, -3.1598e-02, 9.4211e-03, -9.7909e-03,\n",
|
118 |
+
" 4.4678e-02, -4.8716e-03, 1.8896e-02, 9.5822e-03, -2.3881e-02,\n",
|
119 |
+
" -9.0785e-03, 5.4653e-03, 3.0017e-02, -3.0415e-02, -1.3150e-03,\n",
|
120 |
+
" 2.9047e-02, 3.2315e-02, -1.0728e-02, 4.7503e-02, -4.0033e-02,\n",
|
121 |
+
" 3.4482e-02, 6.2684e-02, 3.0337e-02, 5.0680e-02, -8.6022e-03,\n",
|
122 |
+
" 1.5261e-02, 3.7766e-02, -2.4730e-02, 8.6131e-02, 4.5388e-02,\n",
|
123 |
+
" 5.4677e-02, 3.9401e-02, 4.4164e-02, -5.2270e-02, -8.8473e-03,\n",
|
124 |
+
" 8.1178e-03, -1.0574e-02, -7.6409e-05, -8.3209e-03, -8.1179e-04,\n",
|
125 |
+
" 3.2574e-02, -1.4150e-02, -4.0937e-02, 1.0180e-02, 1.3868e-03,\n",
|
126 |
+
" 3.4978e-02, -1.1991e-02, -2.1560e-02, 2.0833e-02, 3.8494e-02,\n",
|
127 |
+
" 1.4916e-02, -1.5102e-02, -1.0009e-02, -9.6670e-03, 3.6516e-03,\n",
|
128 |
+
" 2.6473e-02, -9.1190e-03, -1.9326e-02, 3.2072e-02, -2.9562e-02,\n",
|
129 |
+
" -4.1949e-02, -9.4430e-03, 2.7654e-02, 3.1868e-02, 2.6336e-03,\n",
|
130 |
+
" -1.6622e-02, -3.4676e-02, -3.4540e-02, 8.5971e-03, -9.4823e-03,\n",
|
131 |
+
" -3.6754e-02, 4.9925e-02, 9.8040e-04, -6.7678e-02, 5.0645e-03,\n",
|
132 |
+
" -7.5227e-03, 1.2880e-02, 5.5055e-02, -5.1705e-02, -6.1548e-02,\n",
|
133 |
+
" 1.4440e-03, -6.8204e-03, -1.4279e-02, -2.8179e-02, -2.2386e-02,\n",
|
134 |
+
" 5.2374e-02, -3.4718e-02, 5.3560e-03, -6.3553e-02, 8.3361e-02,\n",
|
135 |
+
" -2.7192e-02, 4.2078e-02, 3.2605e-03, -5.6035e-02, -8.2745e-03,\n",
|
136 |
+
" -2.8813e-02, 4.3161e-02, -5.0922e-02, 3.0529e-02, 2.0102e-02,\n",
|
137 |
+
" 2.9533e-02, -7.8186e-03, -3.0819e-02, -2.1356e-02, -2.7967e-02,\n",
|
138 |
+
" 2.4877e-02, 2.3300e-02, 2.8305e-02, 2.9761e-02, 1.2363e-02,\n",
|
139 |
+
" -1.4158e-02, -1.1000e-02, 2.3479e-02, 4.8863e-02, -1.3325e-02,\n",
|
140 |
+
" 1.2415e-02, -1.0494e-02, -5.3160e-04, -1.3253e-02, -2.4968e-03,\n",
|
141 |
+
" 2.0370e-02, -5.9943e-03, -9.5419e-03, 5.9531e-03, -8.3129e-03,\n",
|
142 |
+
" -4.0607e-03, 6.1272e-03, -2.9724e-02, -1.8579e-02, 1.2740e-02,\n",
|
143 |
+
" -2.6391e-02, 4.1079e-03, -4.0331e-03, 3.4990e-02, -3.4697e-04,\n",
|
144 |
+
" -9.6936e-03, -2.2701e-02, 3.2625e-02, 1.1973e-02, -3.9408e-02,\n",
|
145 |
+
" -6.4848e-02, 4.3097e-02, 2.6910e-02, -3.9942e-02, 3.4112e-02,\n",
|
146 |
+
" -7.8409e-03, -4.3240e-02, -1.6996e-02, 3.8101e-02, -3.8530e-02,\n",
|
147 |
+
" 2.1452e-04, 3.7173e-02, 2.3474e-02, 1.9435e-03, -2.1596e-02,\n",
|
148 |
+
" 1.2855e-02, 4.8854e-03, 2.1395e-02, -2.4349e-02, 7.3487e-03,\n",
|
149 |
+
" -2.7641e-02, -1.5773e-02, 1.1367e-02, 8.7802e-03, 2.3783e-02,\n",
|
150 |
+
" 3.3420e-02, 3.4498e-02, 2.2979e-02, -1.2473e-02, 3.1100e-02,\n",
|
151 |
+
" 6.0752e-02, -2.5795e-02, 1.7830e-02, -1.3168e-02, 8.0613e-04,\n",
|
152 |
+
" 1.3292e-02, 8.1109e-03, 2.1875e-03, -1.0863e-02, 3.8718e-02,\n",
|
153 |
+
" 4.5967e-02, -1.2454e-01, 2.6564e-02, -4.4082e-04, 1.8394e-02,\n",
|
154 |
+
" 2.9872e-02, 6.4751e-03, 5.4129e-03, 2.0823e-02, -4.9624e-02,\n",
|
155 |
+
" -2.3234e-02, -5.7144e-02, -1.3117e-02, -5.3304e-02, -1.9084e-02,\n",
|
156 |
+
" -1.9121e-02, 2.5556e-04, -3.9970e-02, -3.3640e-02, 1.0532e-02,\n",
|
157 |
+
" 5.7862e-02, -4.0414e-02, 6.6390e-03, 1.6265e-03, 1.0555e-02,\n",
|
158 |
+
" -5.1818e-03, -3.9941e-02, 8.6119e-02, 2.5038e-02, 1.1136e-02,\n",
|
159 |
+
" -8.5421e-03, -2.0004e-02, 3.0798e-02, -4.8180e-03, -1.1030e-02,\n",
|
160 |
+
" 7.1489e-03, 7.0376e-02, -4.2558e-02, -5.4193e-02, 6.0990e-03,\n",
|
161 |
+
" 1.5232e-02, 1.3667e-02, -1.5016e-02, -1.0382e-03, -6.4072e-03,\n",
|
162 |
+
" 2.3970e-03, 3.7884e-02, -1.7684e-02, 2.0192e-02, -2.1400e-02,\n",
|
163 |
+
" 1.6529e-02, 1.8982e-02, 1.6748e-02, -2.0919e-02, 1.2904e-02,\n",
|
164 |
+
" -1.5105e-02, -1.7961e-02, 2.2824e-03, 9.0103e-04, 1.3905e-02,\n",
|
165 |
+
" -5.2162e-02, 5.7747e-03, 6.7262e-03, 6.3685e-03, -1.2071e-02,\n",
|
166 |
+
" -2.7873e-02, -1.4171e-04, -4.8872e-02, -8.9744e-03, -1.0448e-02,\n",
|
167 |
+
" 4.9146e-02, -2.0365e-02, -6.8874e-02, 1.3715e-02, -2.8159e-02,\n",
|
168 |
+
" 5.1973e-03, -4.1494e-02, 1.7353e-02, -1.4510e-02, -4.5331e-03,\n",
|
169 |
+
" 1.0267e-02, -2.9127e-02, 1.0169e-02, -5.0776e-03, -2.0463e-02,\n",
|
170 |
+
" 1.6880e-02, 2.4789e-02, -3.2186e-02, -1.5043e-02, -9.5236e-03,\n",
|
171 |
+
" -1.8453e-02, 1.9968e-01, -3.1110e-02, -3.4481e-02, -5.3706e-03,\n",
|
172 |
+
" -2.3295e-02, -6.6525e-02, 1.5241e-02, -5.3700e-02, -1.3558e-02,\n",
|
173 |
+
" -7.4800e-02, 4.6305e-02, 4.3405e-03, 1.0513e-02, -1.4961e-02,\n",
|
174 |
+
" 1.2347e-01, -4.1887e-02, -2.9692e-02, -2.0832e-02, 2.5459e-03,\n",
|
175 |
+
" 1.5311e-02, -1.3357e-02, 1.3205e-02, 2.8943e-02, 4.9173e-02,\n",
|
176 |
+
" 3.3758e-02, 1.1087e-02, 4.2151e-02, 6.3205e-04, -4.3288e-02,\n",
|
177 |
+
" 2.3333e-02, 1.5167e-02, -1.0237e-02, -7.9236e-02, 4.3594e-03,\n",
|
178 |
+
" 3.1445e-02, 4.2794e-03, -9.3492e-03, -3.5418e-02, -1.9242e-02,\n",
|
179 |
+
" -3.0336e-02, 7.7880e-03, 6.6255e-02, -7.5213e-03, 2.5932e-02,\n",
|
180 |
+
" -1.7802e-02, 1.8590e-03, 5.3834e-03, 9.6787e-02, 2.8787e-02,\n",
|
181 |
+
" 9.1017e-04, -1.8586e-02, 2.2730e-02, -9.7814e-02, 4.2616e-02,\n",
|
182 |
+
" 4.0229e-02, -8.9988e-03, -2.0952e-02, 7.7816e-03, -4.0449e-04,\n",
|
183 |
+
" -1.3639e-02, -1.7206e-03, -9.1304e-03, 4.3670e-03, 1.9919e-02,\n",
|
184 |
+
" -2.0095e-02, -2.6256e-03, 3.0235e-02, 3.7728e-03, 6.3254e-04,\n",
|
185 |
+
" -6.9728e-02, 2.5881e-03, 1.0343e-02, 3.3831e-02, 2.2356e-03,\n",
|
186 |
+
" -2.7363e-02, 3.5232e-02, 5.3659e-02, -7.8222e-03, -2.0881e-03,\n",
|
187 |
+
" 2.2187e-02, 2.0626e-02, 3.6413e-02, -4.4460e-03, 4.6213e-02,\n",
|
188 |
+
" -1.4652e-03, 2.1768e-02, 3.3055e-03, -2.3867e-02, -2.7972e-02,\n",
|
189 |
+
" -6.7086e-02, 2.4510e-02, 4.0885e-02, -1.6748e-03, 1.2575e-02,\n",
|
190 |
+
" -2.0675e-04, -1.1889e-02, 4.2555e-03, -2.6686e-02, -9.5006e-03,\n",
|
191 |
+
" -1.3144e-02, 3.0939e-02, -1.9938e-02, 4.2527e-02, -1.4343e-02,\n",
|
192 |
+
" 5.5876e-03, 2.4495e-02, 3.9814e-03, 2.8102e-02, 4.3181e-02,\n",
|
193 |
+
" -1.7406e-02, -4.2736e-02, -8.1578e-03, -5.3989e-03, 2.9429e-03,\n",
|
194 |
+
" 4.3196e-02, -2.0857e-02, -3.0203e-02, -4.0288e-03, -4.4894e-02,\n",
|
195 |
+
" 2.7039e-02, 3.5724e-02, -1.4012e-02, -2.3949e-03, 1.4861e-02,\n",
|
196 |
+
" 3.1610e-02, 4.8555e-02, 1.8550e-02, 1.2663e-02, -6.1358e-03,\n",
|
197 |
+
" -4.1771e-02, 2.8252e-02, -1.1711e-02, -4.0601e-03, -2.9267e-02,\n",
|
198 |
+
" -3.0001e-02, 1.6215e-02], grad_fn=<DivBackward0>)\n"
|
199 |
+
]
|
200 |
+
}
|
201 |
+
],
|
202 |
+
"source": [
|
203 |
+
"from transformers import CLIPVisionModelWithProjection\n",
|
204 |
+
"\n",
|
205 |
+
"model = CLIPVisionModelWithProjection.from_pretrained(\"Diangle/clip4clip-webvid\")\n",
|
206 |
+
"model = model.eval()\n",
|
207 |
+
"visual_output = model(video)\n",
|
208 |
+
"\n",
|
209 |
+
"# Normalizing the embeddings and calculating mean between all embeddings. \n",
|
210 |
+
"visual_output = visual_output[\"image_embeds\"]\n",
|
211 |
+
"visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
|
212 |
+
"visual_output = torch.mean(visual_output, dim=0)\n",
|
213 |
+
"visual_output = visual_output / visual_output.norm(dim=-1, keepdim=True)\n",
|
214 |
+
"print(visual_output)\n",
|
215 |
+
"\n",
|
216 |
+
" "
|
217 |
+
]
|
218 |
+
}
|
219 |
+
],
|
220 |
+
"metadata": {
|
221 |
+
"kernelspec": {
|
222 |
+
"display_name": "Python 3",
|
223 |
+
"language": "python",
|
224 |
+
"name": "python3"
|
225 |
+
},
|
226 |
+
"language_info": {
|
227 |
+
"codemirror_mode": {
|
228 |
+
"name": "ipython",
|
229 |
+
"version": 3
|
230 |
+
},
|
231 |
+
"file_extension": ".py",
|
232 |
+
"mimetype": "text/x-python",
|
233 |
+
"name": "python",
|
234 |
+
"nbconvert_exporter": "python",
|
235 |
+
"pygments_lexer": "ipython3",
|
236 |
+
"version": "3.10.9"
|
237 |
+
},
|
238 |
+
"orig_nbformat": 4
|
239 |
+
},
|
240 |
+
"nbformat": 4,
|
241 |
+
"nbformat_minor": 2
|
242 |
+
}
|
Notebooks/example/34721191.mp4
ADDED
Binary file (875 kB). View file
|
|