ankur2402 committed on
Commit
3abffff
1 Parent(s): 45ff1cb

Upload 12 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Video005-Scene-043.mp4 filter=lfs diff=lfs merge=lfs -text
Final_ISRO_DenseNet201_Epoch50.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6b8145b4fe5a9d3ce9d54edfa35bcce6f27fb85a6581488fac068d2414bd5c9
3
+ size 140107848
Testing1.jpg ADDED
Train_Label.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2883f13fef0601ccf5e6624638dce5b1342685b41f7c94f34d5b2f30acb2eed
3
+ size 198919
Video001-Scene-001.mp4 ADDED
Binary file (413 kB). View file
 
Video003-Scene-044.mp4 ADDED
Binary file (92.8 kB). View file
 
Video005-Scene-043.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55751881488b7d6c4f427f3394a881d3041ddd894d43fd4e544d01b784217a27
3
+ size 1386587
Video015-Scene-074.mp4 ADDED
Binary file (444 kB). View file
 
densenet.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c721c4c5ff8eb454517432bb9ffc1397bb83c5d72f804de08c21c487cd8afb0a
3
+ size 222191800
image_model_transfer.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc07714490afaa42f1b09170e0d27289d05bd6fb56666ca50676a26818da70a8
3
+ size 75231432
main.py ADDED
@@ -0,0 +1,378 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import numpy as np
3
+ import cv2
4
+ import tensorflow as tf
5
+ from PIL import Image
6
+ from keras.models import load_model
7
+ from sklearn.preprocessing import LabelEncoder
8
+ import pickle
9
+ from keras_preprocessing.sequence import pad_sequences
10
+ from keras.preprocessing.text import Tokenizer
11
+ from sklearn.preprocessing import LabelEncoder
12
+ from PIL import Image
13
+ # from google.colab.patches import cv2_imshow
14
+
15
def label_smoothing(y_true, y_pred):
    """Binary cross-entropy loss with a fixed label-smoothing factor of 0.1.

    Passed to ``load_model`` as a custom object so that the saved
    'densenet.h5' model, which references a loss named 'label_smoothing',
    can be deserialized.

    :param y_true: Ground-truth targets.
    :param y_pred: Model predictions.
    :return: The value of ``tf.keras.losses.binary_crossentropy``.
    """
    bce = tf.keras.losses.binary_crossentropy
    return bce(y_true, y_pred, label_smoothing=0.1)
18
def sparse_cross_entropy(y_true, y_pred):
    """Mean sparse softmax cross-entropy between integer labels and logits.

    Passed to ``load_model`` as a custom object for the decoder model.

    :param y_true: Integer class labels.
    :param y_pred: Unnormalized scores (logits) from the model.
    :return: Scalar tensor holding the mean loss over all elements.
    """
    per_element = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=y_pred,
        labels=y_true,
    )
    return tf.reduce_mean(per_element)
23
@st.cache_resource
def _load_models():
    """Load the three Keras models once per server process.

    Streamlit re-executes the whole script on every widget interaction, so
    loading the models at plain module level re-reads all the .h5 files on
    each rerun. ``st.cache_resource`` keeps the loaded objects alive and
    returns the same instances on subsequent reruns.

    :return: Tuple ``(classifier, image_encoder, caption_decoder)``.
    """
    classifier = load_model(
        'densenet.h5',
        custom_objects={'label_smoothing': label_smoothing},
    )
    image_encoder = load_model("image_model_transfer.h5")
    caption_decoder = load_model(
        "Final_ISRO_DenseNet201_Epoch50.h5",
        custom_objects={'sparse_cross_entropy': sparse_cross_entropy},
    )
    return classifier, image_encoder, caption_decoder


# Same module-level names as before, so the rest of the script is unchanged.
model1, image_model_transfer, decoder_model = _load_models()
26
+
27
class TokenizerWrap(Tokenizer):
    """Wrap the Tokenizer-class from Keras with more functionality.

    The class must be defined in this module so that a pickled instance
    (see 'tokenizer.pkl') can be restored. Note that unpickling does not
    call ``__init__``; the restored object carries whatever attributes it
    had when it was saved.
    """

    def __init__(self, texts, num_words=None):
        """
        :param texts: List of strings with the data-set.
        :param num_words: Max number of words to use.
        """
        # The constructor was previously spelled `_init_`, which Python
        # treats as an ordinary method, so it was never run on construction.
        super().__init__(num_words=num_words)

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)

        # Create inverse lookup from integer-tokens to words.
        # word_index maps word -> token; index_to_word is the opposite.
        self.index_to_word = {
            token: word for word, token in self.word_index.items()
        }

    def token_to_word(self, token):
        """Lookup a single word from an integer-token.

        Token 0 (padding) maps to a single space.

        :raises KeyError: If the token is not in the vocabulary.
        """
        return " " if token == 0 else self.index_to_word[token]

    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a space-separated string.

        Padding tokens (0) are skipped.
        """
        return " ".join(
            self.index_to_word[token] for token in tokens if token != 0
        )

    def captions_to_tokens(self, captions_listlist):
        """
        Convert a list-of-list with text-captions to
        a list-of-list of integer-tokens.
        """
        # Note that texts_to_sequences() takes a list of texts.
        return [
            self.texts_to_sequences(captions_list)
            for captions_list in captions_listlist
        ]
76
# Load the pickled training labels and the pickled tokenizer.
# NOTE(review): pickle.load can execute arbitrary code; these files must come
# from a trusted source.
with open('Train_Label.pickle', 'rb') as efile:
    labels=pickle.load(efile)
# Presumably a TokenizerWrap instance (token_to_word is called on it later
# in this file) -- TODO confirm against the script that produced the pickle.
with open('tokenizer.pkl', 'rb') as efile:
    tokenizer=pickle.load(efile)

# Fit a LabelEncoder on the loaded labels; `le` is used later to map predicted
# class indices back to label values. `labels` is rebound to the encoded
# integer labels.
le=LabelEncoder()
labels=le.fit_transform(labels)
83
+
84
def framing(video):
    """Read every frame of a video and compute a Canny edge map for each.

    :param video: Path (or other source accepted by ``cv2.VideoCapture``)
        of the video to read.
    :return: Tuple ``(fr_pre, fr)`` where ``fr_pre`` is the list of edge
        maps and ``fr`` is the list of original BGR frames, index-aligned.
        Both lists are empty if the video cannot be opened.
    """
    fr = []      # original frames
    fr_pre = []  # edge-detected frames
    cap = cv2.VideoCapture(video)
    try:
        while cap.isOpened():
            # `ret` reports whether a frame was read; `frame` is the data.
            ret, frame = cap.read()
            if not ret:
                break
            grayed = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # Canny edge detection with both hysteresis thresholds at 320.
            canned = cv2.Canny(grayed, 320, 320)
            fr.append(frame)
            fr_pre.append(canned)
            # No cv2.waitKey()/destroyAllWindows() here: nothing is displayed
            # in this server-side app, so those GUI calls only added a wait
            # per frame and fail on OpenCV builds without GUI support.
    finally:
        # Release the capture even if frame processing raised.
        cap.release()
    return fr_pre, fr
106
+
107
def difference_of_frames(frames):
    """Return the absolute difference between each pair of consecutive frames.

    :param frames: Sequence of frames (arrays accepted by ``cv2.absdiff``).
    :return: List with ``len(frames) - 1`` difference images; element ``i``
        is ``absdiff(frames[i], frames[i + 1])``. Empty when fewer than two
        frames are given.
    """
    return [
        cv2.absdiff(current, following)
        for current, following in zip(frames, frames[1:])
    ]
112
+
113
def cal_threshold(diff, a=4):
    """Return the global threshold ``mean + a * std`` over all differences.

    :param diff: Difference images (array or list of equally shaped arrays).
    :param a: Multiplier applied to the standard deviation. Defaults to 4,
        the value that was previously hard-coded.
    :return: The threshold as a float.
    """
    mn = np.mean(diff)
    st_d = np.std(diff)
    return mn + (a * st_d)
119
+
120
def imp_frames(diff, ts, ogframes, a=4):
    """Select the original frames whose difference image passes the threshold.

    For each difference image a per-frame value ``mean + a * std`` is
    computed; the frame at the same index in ``ogframes`` is kept when that
    value is greater than or equal to ``ts``.

    :param diff: List of difference images.
    :param ts: Global threshold to compare each per-frame value against.
    :param ogframes: Original frames, indexed like ``diff``.
    :param a: Multiplier applied to the per-frame standard deviation.
        Defaults to 4, the value that was previously hard-coded.
    :return: List of selected frames, in their original order.
    """
    key_fr = []
    for i, frame_diff in enumerate(diff):
        fr_ts = np.mean(frame_diff) + (a * np.std(frame_diff))
        if fr_ts >= ts:
            key_fr.append(ogframes[i])
    return key_fr
135
+
136
def final_image(video):
    """Extract a representative key frame from a video and save it as a JPEG.

    The video is split into frames, consecutive edge maps are differenced,
    and frames whose difference passes the global threshold are treated as
    key frames. The middle key frame is written to 'Testing1.jpg'.

    :param video: Path of the video to process.
    :return: The path of the written image, always "Testing1.jpg".
    :raises ValueError: If no frame could be read from the video.
    """
    frames, ogframes = framing(video)
    if not ogframes:
        raise ValueError(
            "No frames could be read from video: {!r}".format(video)
        )

    diff = difference_of_frames(frames)
    # A single-frame video has no differences; skip thresholding then.
    key_fr = imp_frames(diff, cal_threshold(diff), ogframes) if diff else []

    # Previously an empty key-frame list raised IndexError; fall back to the
    # full list of frames so that the middle frame of the video is used.
    if not key_fr:
        key_fr = ogframes

    middle_frame = key_fr[len(key_fr) // 2]
    cv2.imwrite("Testing1.jpg", middle_frame)
    return "Testing1.jpg"
145
+
146
def image_test(image_path):
    """Load an image as a batch of one 224x224 RGB array.

    :param image_path: Path of the image file.
    :return: Array of shape (1, 224, 224, 3) with the raw pixel values
        (no scaling is applied).
    """
    # The context manager closes the underlying file handle, which was
    # previously left open until garbage collection.
    with Image.open(image_path) as source:
        # Force three channels so grayscale/RGBA files yield the same shape,
        # mirroring the gray-to-RGB handling in load_image().
        # NOTE(review): assumes the classifier expects 3-channel input -- confirm.
        resized = source.convert("RGB").resize((224, 224))
    return np.expand_dims(np.array(resized), axis=0)
152
+
153
def largest_indices(ary, n):
    """Return the flat indices of the ``n`` largest values, largest first.

    :param ary: Array of scores; it is flattened before ranking.
    :param n: Number of indices to return. Values larger than the number of
        elements are clamped; ``n <= 0`` yields an empty result.
    :return: 1-D integer array of indices into the flattened input.
    """
    flat = np.asarray(ary).flatten()
    n = min(int(n), flat.size)
    if n <= 0:
        # Guard needed: with n == 0 the slice `[-0:]` selects everything.
        return np.empty(0, dtype=np.intp)
    indices = np.argpartition(flat, -n)[-n:]
    return indices[np.argsort(-flat[indices])]
158
+
159
# Marker words for the start and end of a caption.
# NOTE(review): presumably the same markers that were wrapped around the
# captions at training time -- confirm against the training code.
mark_start = 'ssss'
mark_end = ' eeee'

# Integer tokens of the two markers; surrounding whitespace is stripped
# before the vocabulary lookup. Raises KeyError if a marker is missing from
# the tokenizer's word_index.
token_start = tokenizer.word_index[mark_start.strip()]
token_end = tokenizer.word_index[mark_end.strip()]
164
+
165
def load_image(path, size=None):
    """
    Load the image from the given file-path and resize it
    to the given size if not None.

    :param path: Path of the image file.
    :param size: Optional ``(width, height)`` to resize to (LANCZOS filter).
    :return: Float array with pixel values scaled by 1/255. A 2-dim
        gray-scale image is expanded to 3 identical channels.
    """
    # Load the image using PIL; the context manager closes the file handle,
    # which was previously leaked.
    with Image.open(path) as source:
        # Resize image if desired.
        if size is not None:
            picture = source.resize(size=size, resample=Image.LANCZOS)
        else:
            picture = source
        # Materialize the pixels while the file is still open.
        img = np.array(picture)

    img = img / 255.0

    # Convert 2-dim gray-scale array to 3-dim RGB array.
    if img.ndim == 2:
        img = np.repeat(img[:, :, np.newaxis], 3, axis=2)
    return img
185
+
186
def greedy_search(image_path, max_tokens=30):
    """
    Generate a caption for the image in the given path.
    The caption is limited to the given number of tokens (words).

    At every step the highest-scoring token is chosen and fed back to the
    decoder until the end marker is produced or ``max_tokens`` is reached.

    :param image_path: Path of the image to caption.
    :param max_tokens: Maximum number of decoding steps.
    :return: List of caption words, without the start and end markers.
    """
    # ---------------------------ENCODE IMAGE--------------------------------
    # Load and resize the image.
    image = load_image(image_path, size=(224, 224))

    # The image-model expects a batch, so wrap the single image in one.
    image_batch = np.expand_dims(image, axis=0)

    # Process the image with the pre-trained image-model
    # to get the transfer-values.
    transfer_values = image_model_transfer.predict(image_batch)
    # -----------------------------------------------------------------------

    # Pre-allocate the 2-dim array used as input to the decoder: a batch
    # holding a single sequence of integer-tokens.
    decoder_input_data = np.zeros(shape=(1, max_tokens), dtype=int)

    # The first input-token is the special start-token.
    token_int = token_start
    predicted_caption = []
    count_tokens = 0

    while token_int != token_end and count_tokens < max_tokens:
        # Feed the last sampled token back in; in the first iteration this
        # sets the first element to the start-token.
        decoder_input_data[0, count_tokens] = token_int

        # Named inputs so the data is passed in the right order.
        x_data = {
            'transfer_values_input': transfer_values,
            'decoder_input': decoder_input_data,
        }

        # The entire sequence is re-run through the decoder every step;
        # only the output at the current position is used.
        decoder_output = decoder_model.predict(x_data)

        # Index of the largest score at the current position. The scores
        # need not be normalized for argmax.
        token_int = np.argmax(decoder_output[0, count_tokens, :])
        count_tokens += 1

        # Stop without recording the end marker. Previously the last word
        # was always deleted afterwards, which also dropped a real word
        # whenever decoding stopped at max_tokens instead of the marker.
        if token_int == token_end:
            break

        # split() discards the blank returned for the padding token 0.
        predicted_caption.extend(tokenizer.token_to_word(token_int).split())

    return predicted_caption
288
+
289
def beam_search(beam_index, image_path, max_tokens=30):
    """Generate a caption for an image with beam search.

    At each step every kept sequence is extended by its ``beam_index``
    highest-scoring next tokens, and the ``beam_index`` candidates with the
    highest accumulated decoder scores are kept.

    :param beam_index: Beam width (number of sequences kept per step).
    :param image_path: Path of the image to caption.
    :param max_tokens: Length the token sequences are grown to.
    :return: List of words of the best sequence, without the start marker
        and cut at the first 'eeee' end marker.
    """
    # Encode the image: the image-model expects a batch, so a batch with
    # just this one image is passed in.
    picture = load_image(image_path, size=(224, 224))
    transfer_values = image_model_transfer.predict(
        np.expand_dims(picture, axis=0)
    )

    # Each beam is [token_list, accumulated_score].
    beams = [[[token_start], 0.0]]
    step = 0
    while len(beams[0][0]) < max_tokens:
        candidates = []

        for tokens, score in beams:
            padded = pad_sequences([tokens], maxlen=max_tokens, padding='post')
            preds = decoder_model.predict([transfer_values, padded], verbose=0)
            step_scores = preds[0, step, :]

            # The beam_index best next tokens for this beam.
            for w in np.argsort(step_scores)[-beam_index:]:
                candidates.append([tokens + [w], score + step_scores[w]])

        step += 1
        # Sort ascending by accumulated score and keep the top beams.
        ranked = sorted(candidates, reverse=False, key=lambda l: l[1])
        beams = ranked[-beam_index:]

    # The last beam has the highest score.
    best_tokens = beams[-1][0]
    intermediate_caption = [tokenizer.token_to_word(i) for i in best_tokens]

    final_caption = []
    for word in intermediate_caption:
        if word == 'eeee':
            break
        final_caption.append(word)

    # Drop the leading start marker.
    return final_caption[1:]
340
+
341
def generate_caption_any(image_path):
    """Return the caption for an image as a single string.

    The caption comes from beam search with beam width 3. The greedy and
    beam-width-5 captions that used to be computed here were never used, so
    they are no longer generated; the return value is unchanged.

    :param image_path: Path of the image to caption.
    :return: Caption words joined by single spaces.
    """
    return ' '.join(beam_search(beam_index=3, image_path=image_path))
347
+
348
+
349
+
350
def main():
    """Render the Streamlit page and run classification and captioning.

    The user picks one of the bundled videos. On "Submit" the video is
    shown, a key frame is extracted, and the predicted category and the
    generated caption are displayed.
    """
    st.title("ISRO Video Classification & Captioning")
    st.write('In this project, we introduce a technique for video classification and captioning, harnessing a keyframe extraction method to streamline the process. Utilizing Densenet 201, our model is designed to classify videos by focusing on the most crucial frame, optimizing efficiency and performance. Users can experience our innovative approach by employing any of the provided three videos or by uploading additional ISRO footage to witness the improved model in action.')

    # Display name -> bundled video file.
    video_options = {
        "Video 1": "Video001-Scene-001.mp4",
        "Video 2": "Video015-Scene-074.mp4",
        "Video 3": "Video005-Scene-043.mp4",
    }

    choice = st.selectbox("Select a video to submit", list(video_options))
    video_path = video_options[choice]

    # Nothing more to do until the user submits.
    if not st.button("Submit"):
        return

    st.video(video_path)

    # Key frame -> class scores and caption.
    frame_path = final_image(video_path)
    frame_batch = image_test(frame_path)
    class_scores = model1.predict(frame_batch)
    caption_text = generate_caption_any(frame_path)
    top_indices = largest_indices(class_scores, 3)

    st.title('The predicted category is:')
    # Only the best of the three top-ranked classes is shown.
    st.write(le.inverse_transform(top_indices)[0])

    st.title('Caption:')
    st.write(caption_text.capitalize())


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ tensorflow
3
+ keras
4
+ pandas
5
+ numpy
6
+ pillow
7
+ opencv-python
8
+ scikit-learn
tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8dbc8b1feb6fa1526be183d3e5df1151c4e63e6bda4c1d24543d3bf56a13fe7
3
+ size 38857