"""compareVec2VecWithAda.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1jPaNXdO0_oW6VczlWfm5RPUVpMtVQD9c
"""
import pandas as pd
import numpy as np
import openai
import tensorflow as tf  # needed by cosine_similarity_loss below
import torch
import torch.nn.functional as F
from ast import literal_eval
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.models import load_model
from transformers import AutoTokenizer, AutoModel
# Load the MPNet sentence encoder and its tokenizer. The bare model name does
# not resolve on the Hugging Face Hub; it lives under the
# sentence-transformers namespace.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
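# Note (illustrative, not part of the original notebook): the tokenizer
# returns a dict-like BatchEncoding of tensors, e.g.
#
#   enc = tokenizer("hello world", return_tensors='pt')
#   # enc['input_ids'] holds token ids; enc['attention_mask'] marks real
#   # (non-padding) tokens, which mean_pooling below relies on.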
def cosine_similarity_loss(y_true, y_pred):
    # L2-normalize both batches, then take the elementwise product. With
    # reduce_mean over the feature axis this is the cosine similarity scaled
    # by 1/dim; negating it turns the similarity into a loss to minimize.
    y_true = tf.nn.l2_normalize(y_true, axis=-1)
    y_pred = tf.nn.l2_normalize(y_pred, axis=-1)
    return -tf.reduce_mean(y_true * y_pred, axis=-1)
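# Quick sanity check (illustrative, not part of the original notebook): a
# vector scored against itself has cosine similarity 1, so under the
# reduce_mean scaling the loss is -1/dim.
#
#   v = tf.constant([[3.0, 4.0]])
#   assert abs(float(cosine_similarity_loss(v, v)) + 0.5) < 1e-6  # -1/dim, dim=2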
def mean_pooling(model_output, attention_mask):
    # Average the token embeddings into one sentence vector, using the
    # attention mask so padding tokens do not contribute to the mean.
    token_embeddings = model_output[0]  # last hidden state: (batch, seq, dim)
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
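# Worked example (illustrative): one real token and one padding token; only
# the unmasked token contributes to the pooled vector.
#
#   hidden = torch.tensor([[[1.0, 1.0], [9.0, 9.0]]])  # (batch=1, seq=2, dim=2)
#   mask = torch.tensor([[1, 0]])                      # second token is padding
#   mean_pooling((hidden,), mask)                      # -> tensor([[1., 1.]])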
# Load the trained translator network. The custom loss must be passed via
# custom_objects so Keras can deserialize the compiled model.
loaded_model = load_model('mpnet2adaE75V4.h5', custom_objects={'cosine_similarity_loss': cosine_similarity_loss})
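# For reference (illustrative; the real architecture inside mpnet2adaE75V4.h5
# may differ): an interface-compatible translator maps MPNet's 768-d vectors
# into ada-002's 1536-d space and compiles with the same loss.
#
#   translator = tf.keras.Sequential([
#       tf.keras.layers.Input(shape=(768,)),
#       tf.keras.layers.Dense(1536),
#   ])
#   translator.compile(optimizer='adam', loss=cosine_similarity_loss)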
# Set your OpenAI API key before running.
openai.api_key = "insert API key here"
# Each row of Actual_Embeddings.csv pairs a text ('combined') with its
# precomputed ada-002 embedding stored as a stringified Python list.
df2 = pd.read_csv('Actual_Embeddings.csv')

# Parse the stringified lists back into numpy arrays; literal_eval is a safer
# drop-in for eval here.
df2['Actual_Embeddings'] = df2['Actual_Embeddings'].apply(literal_eval).apply(np.array)
def get_top_5_texts(query):
    # Embed the query with MPNet: tokenize, encode, then mean-pool.
    encoded_input = tokenizer(query, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        model_output = model(**encoded_input)

    mpnetEmbeddings = mean_pooling(model_output, encoded_input['attention_mask'])
    mpnetEmbeddings = F.normalize(mpnetEmbeddings, p=2, dim=1)
    mpnetEmbeddings = mpnetEmbeddings.detach().cpu().numpy()
    mpnetEmbeddings = np.reshape(mpnetEmbeddings, (1, -1))

    # Translate the MPNet embedding into the ada-002 embedding space.
    query_embedding = loaded_model.predict(mpnetEmbeddings)

    # Score the translated query against every stored ada-002 embedding.
    similarities = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0]
                    for emb in df2['Actual_Embeddings']]
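    # Equivalent one-shot scoring (illustrative, not in the original
    # notebook): cosine_similarity accepts a matrix of candidates, so the
    # per-row loop above can be replaced by
    #
    #   emb_matrix = np.vstack(df2['Actual_Embeddings'].to_numpy())
    #   similarities = cosine_similarity(query_embedding.reshape(1, -1), emb_matrix)[0]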
| | print("Converted MPNet Embedding Results:") |
| | top_5_idx2 = np.argsort(similarities)[-5:][::-1] |
| | for i, idx in enumerate(top_5_idx2, 1): |
| | print(f'Text {i}') |
| | print(df2['combined'].iloc[idx]) |
| | print("\n") |
| |
|
    # For comparison, embed the same query directly with ada-002
    # (pre-1.0 openai client API).
    response = openai.Embedding.create(input=query, model="text-embedding-ada-002")
    query_embedding = np.array(response['data'][0]['embedding'])
    similarities2 = [cosine_similarity(query_embedding.reshape(1, -1), emb.reshape(1, -1))[0][0]
                     for emb in df2['Actual_Embeddings']]
| | print("OpenAI Embedding Results:") |
| | top_5_idx2 = np.argsort(similarities2)[-5:][::-1] |
| | for i, idx in enumerate(top_5_idx2, 1): |
| | print(f'Text {i}') |
| | print(df2['combined'].iloc[idx]) |
| | print("\n") |
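    # Optional (illustrative, not in the original notebook): quantify how
    # closely the translated pipeline tracks ada-002 by counting shared
    # top-5 indices.
    #
    #   overlap = len(set(top_5_idx) & set(top_5_idx2))
    #   print(f"Top-5 overlap: {overlap}/5")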
# Simple interactive loop: type a query to see the top-5 matches from both
# pipelines (interrupt with Ctrl+C to exit).
while True:
    query = input("Enter your query: ")
    get_top_5_texts(query)