Update app.py
Browse files
app.py
CHANGED
|
@@ -28,9 +28,10 @@ dataset = load_dataset("Namitg02/Test", split='train', streaming=False)
|
|
| 28 |
#Returns a list of dictionaries, each representing a row in the dataset.
|
| 29 |
print(dataset[1])
|
| 30 |
dataset.features
|
|
|
|
| 31 |
#Itemdetails = dataset.items()
|
| 32 |
#print(Itemdetails)
|
| 33 |
-
splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
|
| 34 |
|
| 35 |
#docs = splitter.create_documents(str(dataset))
|
| 36 |
|
|
@@ -46,8 +47,16 @@ embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")
|
|
| 46 |
|
| 47 |
#doc_func = lambda x: x.text
|
| 48 |
#dataset = list(map(doc_func, dataset))
|
| 49 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 50 |
print(embeddings)
|
|
|
|
| 51 |
#def embedder(dataset[i]):
|
| 52 |
# return embedding_model.encode(dataset[i])
|
| 53 |
|
|
@@ -57,7 +66,7 @@ print(dataset[2])
|
|
| 57 |
#embeddings = embedding_model.encode(dataset)
|
| 58 |
|
| 59 |
#embeddings = embedding_model.embed_documents(docs)
|
| 60 |
-
|
| 61 |
embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
| 62 |
print(dataset[1])
|
| 63 |
#data = FAISS.from_embeddings(embed, embedding_model)
|
|
|
|
| 28 |
#Returns a list of dictionaries, each representing a row in the dataset.
|
| 29 |
print(dataset[1])
|
| 30 |
dataset.features
|
| 31 |
+
length = len(dataset)
|
| 32 |
#Itemdetails = dataset.items()
|
| 33 |
#print(Itemdetails)
|
| 34 |
+
#splitter = RecursiveCharacterTextSplitter(chunk_size=150, chunk_overlap=25) # ["\n\n", "\n", " ", ""])
|
| 35 |
|
| 36 |
#docs = splitter.create_documents(str(dataset))
|
| 37 |
|
|
|
|
| 47 |
|
| 48 |
#doc_func = lambda x: x.text
|
| 49 |
#dataset = list(map(doc_func, dataset))
|
| 50 |
+
|
| 51 |
+
def embedder(dataset):
    """Compute the embedding for one dataset row; meant for ``Dataset.map``.

    ``Dataset.map`` (non-batched, the default) calls this function once per
    example and passes that example as a plain ``dict`` of column name ->
    value. The parameter keeps its original name ``dataset`` for backward
    compatibility, but it holds a single row, not the whole dataset.

    The previous body indexed ``dataset[0]["text"]`` and called
    ``add_column`` on the argument. On a row dict the first raises
    ``KeyError`` and the second does not exist, so the map call could never
    succeed.

    Args:
        dataset: One example from the dataset; its ``"text"`` field is the
            string that gets embedded.

    Returns:
        A dict ``{"embeddings": vector}``. ``Dataset.map`` merges the
        returned keys into the row, which creates the ``embeddings`` column
        on the mapped dataset.
    """
    # Encode only this row's text; map() assembles the column across rows.
    return {"embeddings": embedding_model.encode(dataset["text"])}
|
| 55 |
+
updated_dataset = dataset.map(embedder)
|
| 56 |
+
dataset['text'][:length]
|
| 57 |
+
|
| 58 |
print(embeddings)
|
| 59 |
+
|
| 60 |
#def embedder(dataset[i]):
|
| 61 |
# return embedding_model.encode(dataset[i])
|
| 62 |
|
|
|
|
| 66 |
#embeddings = embedding_model.encode(dataset)
|
| 67 |
|
| 68 |
#embeddings = embedding_model.embed_documents(docs)
|
| 69 |
+
|
| 70 |
embedding_dim = embedding_model.get_sentence_embedding_dimension()
|
| 71 |
print(dataset[1])
|
| 72 |
#data = FAISS.from_embeddings(embed, embedding_model)
|