destinyebuka committed on
Commit
b398556
1 Parent(s): 03ebe50
Files changed (29)
  1. app/ai/routes/chat.py +18 -23
  2. app/ai/tools/listing_tool.py +229 -55
  3. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/README.md +0 -173
  4. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/data_config.json +0 -1452
  5. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/model.safetensors +0 -3
  6. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model.onnx +0 -3
  7. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O1.onnx +0 -3
  8. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O2.onnx +0 -3
  9. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O3.onnx +0 -3
  10. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O4.onnx +0 -3
  11. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx +0 -3
  12. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +0 -3
  13. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +0 -3
  14. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx +0 -3
  15. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.bin +0 -3
  16. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.xml +0 -0
  17. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin +0 -3
  18. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml +0 -0
  19. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/pytorch_model.bin +0 -3
  20. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/tokenizer.json +0 -0
  21. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/train_script.py +0 -344
  22. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/vocab.txt +0 -0
  23. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db +0 -3
  24. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f +0 -3
  25. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/cb202bfe2e3c98645018a6d12f182a434c9d3e02 +0 -3
  26. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938 +0 -3
  27. models/sentence-transformers_all-MiniLM-L6-v2/README.md +0 -3
  28. models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +0 -3
  29. models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +0 -3
app/ai/routes/chat.py CHANGED
@@ -1,5 +1,5 @@
- # app/ai/routes/chat.py - FINAL VERSION
- # Sends only draft data, Flutter builds native UI
+ # app/ai/routes/chat.py - UPDATED FOR FRONTEND IMAGE URLs
+ # Images are now extracted from message text, no separate upload endpoint needed
 
  from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
  from fastapi.security import HTTPBearer
@@ -33,7 +33,6 @@ class AskBody(BaseModel):
  session_id: Optional[str] = None
  thread_id: Optional[str] = None
  start_new_session: Optional[bool] = False
- image_urls: Optional[List[str]] = None # URLs from Cloudflare (client-side uploaded)
 
 
  class ChatResponse(BaseModel):
@@ -42,6 +41,7 @@ class ChatResponse(BaseModel):
  action: str
  state: Optional[Dict[str, Any]] = None
  draft: Optional[Dict[str, Any]] = None
+ draft_ui: Optional[Dict[str, Any]] = None # UI component for draft preview
  mongo_id: Optional[str] = None
  error: Optional[str] = None
 
@@ -118,9 +118,13 @@ async def ask_ai(
  """
  Main chat endpoint with:
  - Greeting detection & response
- - Simplified listing flow
- - Image URL handling from Cloudflare
- - Returns only draft data (NO HTML)
+ - Simplified listing flow with IMAGE URL EXTRACTION from message
+ - Returns draft data WITH UI COMPONENT
+
+ Image Handling:
+ - Frontend uploads image and gets URL
+ - User sends message: "Here's the property image: https://..."
+ - Aida extracts URL from message and stores it
 
  Flow:
  1. Authenticate
@@ -128,7 +132,7 @@ async def ask_ai(
  3. Get/create memory
  4. Check for greeting
  5. Detect intent (listing, publish, edit, etc.)
- 6. Process accordingly
+ 6. Process accordingly (URLs extracted from messages)
  7. Return response
  """
 
@@ -157,7 +161,6 @@ async def ask_ai(
  user_id=user_id,
  session_id=session_id,
  status=context.get("status"),
- has_images=bool(body.image_urls),
  )
 
  # CHECK RESET
@@ -173,7 +176,6 @@ async def ask_ai(
  context["user_role"] = user_role
  await memory.update_context(context)
  await memory.clear()
- # Continue with normal message processing
 
  # INIT CONTEXT IF NEW
  if not context:
@@ -200,11 +202,9 @@ async def ask_ai(
  user_role=user_role
  )
 
- # Add to history
  await memory.add_message("user", body.message)
  await memory.add_message("assistant", greeting_result["reply"])
 
- # Update context
  context["last_activity"] = datetime.utcnow().isoformat()
  context["status"] = greeting_result["state"].get("status", "idle")
  await memory.update_context(context)
@@ -231,10 +231,7 @@ async def ask_ai(
  "images": [],
  })
 
- # ✅ ADD IMAGE URLs from Cloudflare (client-side uploaded)
- if body.image_urls:
- logger.info(f"Adding {len(body.image_urls)} image URLs to listing", user_id=user_id)
- listing_state["images"].extend(body.image_urls)
+ # ✅ NO SEPARATE IMAGE URLS - process_listing will extract from message
 
  # Process listing
  result = await process_listing(
@@ -242,7 +239,6 @@ async def ask_ai(
  user_id=user_id,
  user_role=user_role,
  current_state=listing_state,
- image_urls=listing_state.get("images", []),
  )
 
  # Update context
@@ -264,7 +260,8 @@ async def ask_ai(
  text=result["reply"],
  action=result["action"],
  state=context,
- draft=result.get("draft"), # ✅ Only send draft data, NO HTML
+ draft=result.get("draft"),
+ draft_ui=result.get("draft_ui"), # UI component
  error=result.get("error")
  )
 
@@ -279,11 +276,11 @@ async def ask_ai(
  # from app.database import get_db
  # db = await get_db()
  # listing = await db.listings.insert_one(draft)
- # listing_id = listing.inserted_id
+ # mongo_id = str(listing.inserted_id)
 
  logger.info("Listing published", user_id=user_id, title=draft.get("title"))
 
- # Clear listing state
+ # CLEAR LISTING STATE AND SET STATUS TO IDLE
  context["status"] = "idle"
  context["listing_state"] = {}
  context["draft"] = None
@@ -368,7 +365,7 @@ async def ask_ai(
  text=reply,
  action="show_draft",
  state=context,
- draft=draft, # ✅ Only send draft data, NO HTML
+ draft=draft,
  )
 
  # 5. DISCARD DRAFT
@@ -407,11 +404,9 @@ async def ask_ai(
  conversation_context=context
  )
 
- # Add to history
  await memory.add_message("user", body.message)
  await memory.add_message("assistant", reply)
 
- # Update context
  context["last_activity"] = datetime.utcnow().isoformat()
  if "tool" in tool_result:
  context["last_tool"] = tool_result["tool"]
@@ -460,7 +455,7 @@ async def health_check():
  """Health check for chat service"""
  return {
  "status": "healthy",
- "service": "Aida Chat with Native Flutter UI",
+ "service": "Aida Chat with Frontend Image URLs",
  "langsmith": "enabled" if os.getenv("LANGCHAIN_API_KEY") else "disabled",
  }
 
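A minimal sketch of how a client might exercise the updated endpoint after this commit. The mount path, host, and bearer auth are assumptions for illustration (the diff only shows the route handler, not the router prefix); the request/response field names (`message`, `session_id`, `action`, `draft`, `draft_ui`) come from the code above.

```python
# Hypothetical client call; URL and auth scheme are assumptions, not
# confirmed by this diff. The image URL now travels inside the message
# text - there is no separate image_urls field anymore.
import asyncio
import httpx

async def send_listing_message(token: str) -> None:
    payload = {
        "message": ("3-bed flat in Lagos, 2 baths, 500k per year. "
                    "Here's the property image: https://imagedelivery.net/abc/img1.png"),
        "session_id": None,
        "start_new_session": False,
    }
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            "https://api.example.com/ai/chat/ask",  # assumed mount point
            json=payload,
            headers={"Authorization": f"Bearer {token}"},
        )
    data = resp.json()
    if data["action"] == "show_draft":
        # draft_ui is the new pre-built preview component for the Flutter client
        print(data["draft_ui"]["component_type"], data["draft_ui"]["actions"])

asyncio.run(send_listing_message("dummy-token"))
```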
app/ai/tools/listing_tool.py CHANGED
@@ -1,8 +1,8 @@
  # app/ai/tools/listing_tool.py
- # FINAL VERSION: Simplified listing logic - NO HTML generation
- # Backend sends only draft data, Flutter builds native UI
+ # FINAL VERSION: Random examples + AI-powered URL extraction
 
  import json
+ import re
  from typing import Dict, Optional, Tuple, List
  from pydantic import BaseModel, Field
  from structlog import get_logger
@@ -29,44 +29,142 @@ llm = ChatOpenAI(
  )
 
 
- # ========== STEP 1: SHOW EXAMPLE ==========
+ # ========== AI-POWERED URL EXTRACTION ==========
+
+ async def extract_image_urls_from_message(user_message: str) -> List[str]:
+ """
+ AI-powered image URL extraction using LLM.
+
+ The LLM is smarter than regex:
+ - Understands context
+ - Handles edge cases
+ - Filters out non-image URLs
+ - Extracts from various formats
+
+ Returns:
+ List of image URLs found in message
+ """
+
+ logger.info("Extracting image URLs with AI", msg_len=len(user_message))
+
+ try:
+ prompt_text = f"""Extract image URLs from this user message.
+
+ User message: "{user_message}"
+
+ Your task:
+ 1. Look for URLs in the message
+ 2. Identify which ones are likely image URLs (jpg, png, gif, webp, cloudflare, etc.)
+ 3. Extract ONLY image URLs, NOT other types
+ 4. Return as JSON array
+
+ Important:
+ - Image URLs usually end in: .jpg, .png, .gif, .webp, or contain "imagedelivery", "cloudinary", "imgur", etc.
+ - Include full URLs with https://
+ - Exclude URLs that are clearly not images (don't include docs, videos, etc.)
+ - If no image URLs found, return empty array
+ - Return ONLY valid JSON, nothing else
+
+ Return JSON ONLY:
+ {{
+ "urls": ["https://...", "https://..."] or []
+ }}"""
+
+ messages = [
+ SystemMessage(content="You are a URL extraction expert. Identify and extract image URLs from text. Return ONLY valid JSON with 'urls' array."),
+ HumanMessage(content=prompt_text)
+ ]
+
+ response = await llm.ainvoke(messages)
+ response_text = response.content if hasattr(response, 'content') else str(response)
+
+ logger.info("LLM extraction response", response=response_text[:100])
+
+ # Parse JSON from response
+ try:
+ result = json.loads(response_text)
+ urls = result.get("urls", [])
+
+ # Validate URLs
+ valid_urls = []
+ for url in urls:
+ if isinstance(url, str) and (url.startswith("http://") or url.startswith("https://")):
+ valid_urls.append(url)
+
+ if valid_urls:
+ logger.info("Extracted image URLs with AI", count=len(valid_urls), urls=[u[:60] + "..." for u in valid_urls])
+
+ return valid_urls
+
+ except json.JSONDecodeError:
+ # Try to extract JSON from response
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+ if json_match:
+ try:
+ result = json.loads(json_match.group())
+ urls = result.get("urls", [])
+ return [u for u in urls if isinstance(u, str) and u.startswith(("http://", "https://"))]
+ except:
+ return []
+ return []
+
+ except Exception as e:
+ logger.error("AI URL extraction failed", exc_info=e)
+ return []
+
+
+ # ========== STEP 1: SHOW RANDOM EXAMPLE ==========
 
  async def generate_listing_example(user_language: str, user_role: str) -> str:
  """
- Generate a SHORT, realistic listing example in user's language.
- Different each time (not hardcoded).
+ Generate a RANDOM, unique listing example each time.
+
+ Different every time because:
+ - Random locations
+ - Random prices
+ - Random amenities
+ - Random requirements
+ - Different phrasing/structure
 
- Shows all fields: bedrooms, bathrooms, location, price, amenities, requirements
- Format: Natural sentence (NOT a list)
+ Result: Users never see the same example twice!
  """
 
- logger.info("Generating listing example", language=user_language, role=user_role)
+ logger.info("Generating random listing example", language=user_language, role=user_role)
 
  try:
  role_context = "as a landlord renting an apartment" if user_role == "landlord" else "as a renter looking for a roommate to share your apartment"
 
- prompt = f"""Generate a SHORT, realistic property listing example {role_context} in {user_language}.
+ prompt_text = f"""Generate a UNIQUE, realistic property listing example {role_context} in {user_language}.
+
+ IMPORTANT: Generate a DIFFERENT example each time. Vary:
+ - Location (city name, area)
+ - Number of bedrooms/bathrooms
+ - Price amount
+ - Amenities (different set each time)
+ - Requirements (different each time)
+ - Phrasing and structure
 
  Requirements:
  - Keep it 2-3 sentences MAXIMUM
- - Include ALL of these: location, bedrooms, bathrooms, price, price_type, at least one amenity, one requirement
+ - Include ALL of these fields: location, bedrooms, bathrooms, price, price_type, at least one amenity, one requirement
  - Format: Natural sentence (NOT a list or bullet points)
  - Language: Respond ONLY in {user_language}, no mixing
  - Realistic: Use real cities and reasonable prices
+ - DIFFERENT: Make it unique from previous examples
 
  Example format (DO NOT copy exactly):
- "I have a 2-bedroom, 1-bathroom apartment in [city] for [price] per [time] with wifi and parking. Tenant must provide 3-month deposit."
+ "I have a 2-bedroom, 1-bathroom apartment in [CITY] for [PRICE] per [TIME] with [AMENITY1] and [AMENITY2]. [REQUIREMENT]."
 
- Now generate YOUR OWN unique example in {user_language}:"""
+ Now generate YOUR OWN unique example in {user_language}. Make it different from typical examples:"""
 
  response = await llm.ainvoke([
- SystemMessage(content="You are Aida, a real estate assistant. Generate SHORT, realistic property listing examples. Keep them natural, conversational, under 3 sentences."),
- HumanMessage(content=prompt)
+ SystemMessage(content="You are Aida, a creative real estate assistant. Generate UNIQUE, realistic property listing examples. Keep them natural, conversational, under 3 sentences. Each example should be different from the last."),
+ HumanMessage(content=prompt_text)
  ])
 
  example = response.content if hasattr(response, 'content') else str(response)
 
- logger.info("Example generated successfully", length=len(example))
+ logger.info("Random example generated successfully", length=len(example))
  return example.strip()
 
  except Exception as e:
@@ -102,6 +200,7 @@ Extract these fields (set to null if not mentioned):
  Important:
  - Be smart about understanding intent (typos, informal language)
  - Extract numbers from text (e.g., "2bd" = 2, "50k" = 50000)
+ - IGNORE URLs - do NOT try to extract fields from URLs
  - Return ONLY valid JSON, nothing else
 
  Return JSON ONLY:
@@ -150,17 +249,7 @@ async def auto_detect_listing_type(
  user_role: str,
  user_message: str = ""
  ) -> str:
- """
- Auto-detect listing type based on SIMPLE RULES:
-
- For Landlord:
- - monthly OR yearly → "rent"
- - weekly OR daily OR nightly → "short-stay"
- - "for sale" OR "selling" in message → "sale"
-
- For Renter:
- - ALWAYS → "roommate"
- """
+ """Auto-detect listing type based on SIMPLE RULES."""
 
  if user_role == "renter":
  return "roommate"
@@ -182,10 +271,7 @@
  # ========== STEP 4: AUTO-DETECT CURRENCY ==========
 
  async def get_currency_for_location(location: str) -> str:
- """
- Get currency for location using ML extractor.
- ML extractor handles geolocation + currency detection.
- """
+ """Get currency for location using ML extractor."""
 
  try:
  currency, city, confidence = await ml_extractor.infer_currency(
@@ -213,7 +299,7 @@ async def get_currency_for_location(location: str) -> str:
  "london": "GBP", "manchester": "GBP", "edinburgh": "GBP",
  "paris": "EUR", "lyon": "EUR", "marseille": "EUR",
  "madrid": "EUR", "barcelona": "EUR", "valencia": "EUR",
- "newyork": "USD", "new york": "USD", "losangeles": "USD", "chicago": "USD",
+ "austin": "USD", "newyork": "USD", "new york": "USD", "losangeles": "USD", "chicago": "USD",
  "portland": "USD", "seattle": "USD", "san francisco": "USD",
  }
 
@@ -312,6 +398,59 @@ Return ONLY valid JSON:
  return title, description
 
 
+ # ========== BUILD DRAFT UI COMPONENT ==========
+
+ def build_draft_ui_component(draft: Dict) -> Dict:
+ """
+ Build UI component data for draft preview.
+ Frontend uses this to render the draft preview UI.
+ """
+
+ amenities_icons = {
+ "wifi": "📶",
+ "parking": "🅿️",
+ "furnished": "🛋️",
+ "washing machine": "🧼",
+ "dryer": "🌪️",
+ "ac": "🌬️",
+ "air conditioning": "🌬️",
+ "balcony": "🏠",
+ "pool": "🏊",
+ "gym": "💪",
+ "garden": "🌳",
+ "kitchen": "🍳",
+ }
+
+ # Build amenities with icons
+ amenities = draft.get("amenities", [])
+ amenities_display = []
+ for amenity in amenities:
+ icon = amenities_icons.get(amenity.lower(), "✓")
+ amenities_display.append(f"{icon} {amenity.capitalize()}")
+
+ ui_component = {
+ "component_type": "listing_draft_preview",
+ "title": draft.get("title"),
+ "description": draft.get("description"),
+ "location": draft.get("location"),
+ "bedrooms": draft.get("bedrooms"),
+ "bathrooms": draft.get("bathrooms"),
+ "price": draft.get("price"),
+ "price_type": draft.get("price_type"),
+ "currency": draft.get("currency"),
+ "listing_type": draft.get("listing_type"),
+ "amenities": amenities,
+ "amenities_display": " | ".join(amenities_display) if amenities_display else "No amenities",
+ "requirements": draft.get("requirements") or "No special requirements",
+ "images": draft.get("images", []),
+ "images_count": len(draft.get("images", [])),
+ "user_id": draft.get("user_id"),
+ "actions": ["publish", "edit", "discard"],
+ }
+
+ return ui_component
+
+
  # ========== MAIN PROCESS LISTING ==========
 
  async def process_listing(
@@ -319,18 +458,18 @@ async def process_listing(
  user_id: str,
  user_role: str,
  current_state: Optional[Dict] = None,
- image_urls: Optional[List[str]] = None,
  ) -> Dict:
  """
- Process listing with SIMPLIFIED LOGIC:
-
- 1. Show example first time
- 2. Extract fields
- 3. Ask missing required fields ONE AT A TIME
- 4. Ask about amenities/requirements ONCE
- 5. Auto-detect: currency, listing_type, title, description
- 6. Generate draft
- 7. Return draft for Flutter UI to display
+ Process listing with UPDATED LOGIC:
+
+ 1. Show RANDOM example first time
+ 2. AI-extract image URLs from message
+ 3. Extract fields
+ 4. Ask missing required fields ONE AT A TIME
+ 5. Ask about amenities/requirements ONCE
+ 6. AUTO-REQUIRE AT LEAST 1 IMAGE BEFORE DRAFT
+ 7. Generate draft WITH UI COMPONENT
+ 8. Return draft for Flutter UI to display
  """
 
  logger.info("Processing listing", user_id=user_id, user_role=user_role)
@@ -339,19 +478,31 @@ async def process_listing(
  "status": "listing",
  "step": "initial",
  "provided_fields": {},
- "images": image_urls or [],
+ "images": [],
  }
 
- # STEP 1: Show example if first time
+ # ========== AI-POWERED: EXTRACT IMAGE URLs FROM MESSAGE ==========
+ extracted_urls = await extract_image_urls_from_message(user_message)
+
+ # Add extracted URLs to images list (avoid duplicates)
+ current_images = state.get("images", [])
+ for url in extracted_urls:
+ if url not in current_images:
+ current_images.append(url)
+ logger.info(f"Added image URL from message via AI extraction", url=url[:60] + "...")
+
+ state["images"] = current_images
+
+ # STEP 1: Show RANDOM example if first time
  if state.get("step") == "initial":
- logger.info("First time listing - showing example")
+ logger.info("First time listing - generating random example")
 
  example = await generate_listing_example("en", user_role) # TODO: Detect user language
 
  return {
  "success": True,
  "action": "show_example",
- "reply": f"Great! 🏠 Here's an example of how you could describe it:\n\n\"{example}\"\n\nNow tell me about your property.",
+ "reply": f"Great! 🏠 Here's an example of how you could describe it:\n\n\"{example}\"\n\nNow tell me about your property. You can also upload images by sharing the image URL.",
  "data": {},
  "state": {
  "status": "listing",
@@ -370,7 +521,7 @@ async def process_listing(
  if value is not None and value != [] and value != "":
  provided_fields[key] = value
 
- logger.info("Fields collected so far", provided=list(provided_fields.keys()))
+ logger.info("Fields collected so far", provided=list(provided_fields.keys()), images=len(current_images))
 
  # STEP 3: Check for missing required fields
  missing_fields = [f for f in REQUIRED_FIELDS if f not in provided_fields or provided_fields[f] is None]
@@ -400,7 +551,7 @@ async def process_listing(
  "step": "collecting_fields",
  "provided_fields": provided_fields,
  "missing_fields": missing_fields,
- "images": state.get("images", []),
+ "images": current_images,
  }
  }
 
@@ -417,11 +568,30 @@ async def process_listing(
  "status": "listing",
  "step": "collecting_optional",
  "provided_fields": provided_fields,
- "images": state.get("images", []),
+ "images": current_images,
+ }
+ }
+
+ # STEP 5: CHECK FOR IMAGES - REQUIRE AT LEAST 1
+ if not current_images or len(current_images) == 0:
+ logger.info("No images provided - asking user to upload", user_id=user_id)
+
+ return {
+ "success": True,
+ "action": "ask_images",
+ "reply": "📷 Please share at least one image of your property by sending the image URL. Example: 'Here's the property image: https://imagedelivery.net/...' This helps buyers/renters see what they're getting!",
+ "data": provided_fields,
+ "state": {
+ "status": "listing",
+ "step": "waiting_for_images",
+ "provided_fields": provided_fields,
+ "images": [],
  }
  }
 
- # STEP 5: Auto-detect listing_type and currency
+ logger.info("Images provided", image_count=len(current_images), user_id=user_id)
+
+ # STEP 6: Auto-detect listing_type and currency
  listing_type = await auto_detect_listing_type(
  price_type=provided_fields.get("price_type", ""),
  user_role=user_role,
@@ -439,10 +609,10 @@ async def process_listing(
  user_id=user_id
  )
 
- # STEP 6: Generate title and description
+ # STEP 7: Generate title and description
  title, description = await generate_title_and_description(provided_fields, user_role)
 
- # STEP 7: Build draft
+ # STEP 8: Build draft
  draft = {
  "user_id": user_id,
  "user_role": user_role,
@@ -457,12 +627,15 @@ async def process_listing(
  "listing_type": provided_fields.get("listing_type"),
  "amenities": provided_fields.get("amenities", []),
  "requirements": provided_fields.get("requirements"),
- "images": state.get("images", []), # Images from Cloudflare (client-side upload)
+ "images": current_images, # Images from AI-extracted URLs
  }
 
- logger.info("Draft ready for preview", title=title, location=provided_fields.get("location"))
+ # STEP 9: Build UI component for draft preview
+ draft_ui = build_draft_ui_component(draft)
+
+ logger.info("Draft with UI component ready for preview", title=title, location=provided_fields.get("location"), image_count=len(current_images))
 
- # STEP 8: Return draft (Flutter builds the UI)
+ # STEP 10: Return draft with UI component
  return {
  "success": True,
  "action": "show_draft",
@@ -472,7 +645,8 @@ async def process_listing(
  "status": "listing",
  "step": "preview_ready",
  "provided_fields": provided_fields,
- "images": state.get("images", []),
+ "images": current_images,
  },
  "draft": draft,
+ "draft_ui": draft_ui, # ✅ UI component for frontend
  }
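Since `extract_image_urls_from_message` spends an LLM call per message and has to survive malformed JSON, a deterministic pre-filter is a common complement for the easy cases. A minimal, self-contained sketch; the function name, extension list, and host hints are illustrative assumptions, not code from this commit:

```python
import re

# Hypothetical cheap first pass: catch URLs with an image extension or a
# known image host before (or instead of) calling the LLM extractor.
_URL_RE = re.compile(r"https?://[^\s\"'<>]+")
_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".gif", ".webp")
_HOST_HINTS = ("imagedelivery", "cloudinary", "imgur")

def extract_image_urls_fast(message: str) -> list:
    """Return likely image URLs found in free-form text, preserving order."""
    found = []
    for raw in _URL_RE.findall(message):
        url = raw.rstrip(".,;:!?)")  # strip trailing prose punctuation
        path = url.split("?", 1)[0].lower()
        if path.endswith(_IMAGE_EXTS) or any(h in url for h in _HOST_HINTS):
            if url not in found:  # avoid duplicates, as the AI path does
                found.append(url)
    return found

print(extract_image_urls_fast(
    "Here's the property image: https://imagedelivery.net/acct/img1.png, thanks!"
))  # ['https://imagedelivery.net/acct/img1.png']
```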
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/README.md DELETED
@@ -1,173 +0,0 @@
- ---
- language: en
- license: apache-2.0
- library_name: sentence-transformers
- tags:
- - sentence-transformers
- - feature-extraction
- - sentence-similarity
- - transformers
- datasets:
- - s2orc
- - flax-sentence-embeddings/stackexchange_xml
- - ms_marco
- - gooaq
- - yahoo_answers_topics
- - code_search_net
- - search_qa
- - eli5
- - snli
- - multi_nli
- - wikihow
- - natural_questions
- - trivia_qa
- - embedding-data/sentence-compression
- - embedding-data/flickr30k-captions
- - embedding-data/altlex
- - embedding-data/simple-wiki
- - embedding-data/QQP
- - embedding-data/SPECTER
- - embedding-data/PAQ_pairs
- - embedding-data/WikiAnswers
- pipeline_tag: sentence-similarity
- ---
-
-
- # all-MiniLM-L6-v2
- This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
- ## Usage (Sentence-Transformers)
- Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
- ```
- pip install -U sentence-transformers
- ```
-
- Then you can use the model like this:
- ```python
- from sentence_transformers import SentenceTransformer
- sentences = ["This is an example sentence", "Each sentence is converted"]
-
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- embeddings = model.encode(sentences)
- print(embeddings)
- ```
-
- ## Usage (HuggingFace Transformers)
- Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
-
- ```python
- from transformers import AutoTokenizer, AutoModel
- import torch
- import torch.nn.functional as F
-
- # Mean pooling - take the attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- # Sentences we want sentence embeddings for
- sentences = ['This is an example sentence', 'Each sentence is converted']
-
- # Load model from HuggingFace Hub
- tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-
- # Tokenize sentences
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
- # Compute token embeddings
- with torch.no_grad():
-     model_output = model(**encoded_input)
-
- # Perform pooling
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
- # Normalize embeddings
- sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
-
- print("Sentence embeddings:")
- print(sentence_embeddings)
- ```
-
- ------
-
- ## Background
-
- The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
- contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
- 1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences was actually paired with it in our dataset.
-
- We developed this model during the
- [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
- organized by Hugging Face. We developed this model as part of the project:
- [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPU v3-8s, as well as guidance from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
-
- ## Intended uses
-
- Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
- the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
-
- By default, input text longer than 256 word pieces is truncated.
-
-
- ## Training procedure
-
- ### Pre-training
-
- We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
-
- ### Fine-tuning
-
- We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity of each possible sentence pair from the batch.
- We then apply the cross-entropy loss by comparing with the true pairs.
-
- #### Hyperparameters
-
- We trained our model on a TPU v3-8. We train the model for 100k steps using a batch size of 1024 (128 per TPU core).
- We use a learning rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
- a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
-
- #### Training data
-
- We use the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
- We sampled each dataset given a weighted probability; the configuration is detailed in the `data_config.json` file.
-
-
- | Dataset | Paper | Number of training tuples |
- |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
- | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
- | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
- | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
- | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
- | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
- | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
- | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
- | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
- | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
- | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
- | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
- | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
- | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
- | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
- | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
- | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
- | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
- | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
- | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
- | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
- | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
- | **Total** | | **1,170,060,424** |
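This commit also removes the vendored model copy under app/ml/trainning/models/ and the Hub cache blobs under models/ (files 3-29 above). Presumably the model is now resolved from the Hugging Face Hub at runtime and cached on first use; a minimal sketch, mirroring the usage section of the README deleted above:

```python
# Sketch only - not code from this commit. With the local copies deleted,
# SentenceTransformer downloads the model from the Hub on first use and
# caches it (by default under ~/.cache, configurable via HF_HOME).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(["2-bedroom apartment in Lisbon with wifi"])
print(embeddings.shape)  # (1, 384) - 384-dimensional dense vectors
```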
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/data_config.json DELETED
@@ -1,1452 +0,0 @@
- [
-   {"name": "stackexchange_title_body/skeptics.stackexchange.com.jsonl.gz", "lines": 10009, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/islam.stackexchange.com.jsonl.gz", "lines": 10052, "weight": 1},
-   {"name": "stackexchange_Title_Answer/islam.stackexchange.com.jsonl.gz", "lines": 10052, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/anime.stackexchange.com.jsonl.gz", "lines": 10131, "weight": 1},
-   {"name": "stackexchange_Title_Answer/anime.stackexchange.com.jsonl.gz", "lines": 10131, "weight": 1},
-   {"name": "stackexchange_title_body/writers.stackexchange.com.jsonl.gz", "lines": 10157, "weight": 1},
-   {"name": "stackexchange_title_body/astronomy.stackexchange.com.jsonl.gz", "lines": 10462, "weight": 1},
-   {"name": "stackexchange_title_body/vi.stackexchange.com.jsonl.gz", "lines": 10551, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/french.stackexchange.com.jsonl.gz", "lines": 10578, "weight": 1},
-   {"name": "stackexchange_Title_Answer/french.stackexchange.com.jsonl.gz", "lines": 10578, "weight": 1},
-   {"name": "stackexchange_title_body/cstheory.stackexchange.com.jsonl.gz", "lines": 10642, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/civicrm.stackexchange.com.jsonl.gz", "lines": 10648, "weight": 1},
-   {"name": "stackexchange_Title_Answer/civicrm.stackexchange.com.jsonl.gz", "lines": 10648, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/expressionengine.stackexchange.com.jsonl.gz", "lines": 10742, "weight": 1},
-   {"name": "stackexchange_Title_Answer/expressionengine.stackexchange.com.jsonl.gz", "lines": 10742, "weight": 1},
-   {"name": "stackexchange_title_body/engineering.stackexchange.com.jsonl.gz", "lines": 10753, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/history.stackexchange.com.jsonl.gz", "lines": 10766, "weight": 1},
-   {"name": "stackexchange_Title_Answer/history.stackexchange.com.jsonl.gz", "lines": 10766, "weight": 1},
-   {"name": "stackexchange_title_body/french.stackexchange.com.jsonl.gz", "lines": 10794, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/politics.stackexchange.com.jsonl.gz", "lines": 11047, "weight": 1},
-   {"name": "stackexchange_Title_Answer/politics.stackexchange.com.jsonl.gz", "lines": 11047, "weight": 1},
-   {"name": "stackexchange_title_body/economics.stackexchange.com.jsonl.gz", "lines": 11115, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/craftcms.stackexchange.com.jsonl.gz", "lines": 11236, "weight": 1},
-   {"name": "stackexchange_Title_Answer/craftcms.stackexchange.com.jsonl.gz", "lines": 11236, "weight": 1},
-   {"name": "stackexchange_title_body/anime.stackexchange.com.jsonl.gz", "lines": 11444, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/christianity.stackexchange.com.jsonl.gz", "lines": 11498, "weight": 1},
-   {"name": "stackexchange_Title_Answer/christianity.stackexchange.com.jsonl.gz", "lines": 11498, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/softwarerecs.stackexchange.com.jsonl.gz", "lines": 11761, "weight": 1},
-   {"name": "stackexchange_Title_Answer/softwarerecs.stackexchange.com.jsonl.gz", "lines": 11761, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/boardgames.stackexchange.com.jsonl.gz", "lines": 11805, "weight": 1},
-   {"name": "stackexchange_Title_Answer/boardgames.stackexchange.com.jsonl.gz", "lines": 11805, "weight": 1},
-   {"name": "stackexchange_title_body/islam.stackexchange.com.jsonl.gz", "lines": 11853, "weight": 1},
-   {"name": "stackexchange_title_body/expressionengine.stackexchange.com.jsonl.gz", "lines": 11866, "weight": 1},
-   {"name": "stackexchange_title_body/politics.stackexchange.com.jsonl.gz", "lines": 11894, "weight": 1},
-   {"name": "stackexchange_title_body/history.stackexchange.com.jsonl.gz", "lines": 12021, "weight": 1},
-   {"name": "stackexchange_title_body/christianity.stackexchange.com.jsonl.gz", "lines": 12108, "weight": 1},
-   {"name": "stackexchange_title_body/boardgames.stackexchange.com.jsonl.gz", "lines": 12149, "weight": 1},
-   {"name": "flickr30k_captions.jsonl.gz", "lines": 317695, "weight": 1},
-   {"name": "coco_captions.jsonl.gz", "lines": 828395, "weight": 1},
-   {"name": "codesearchnet.jsonl.gz", "lines": 1151414, "weight": 1},
-   {"name": "stackexchange_title_body/civicrm.stackexchange.com.jsonl.gz", "lines": 12543, "weight": 2},
-   {"name": "stackexchange_title_body/craftcms.stackexchange.com.jsonl.gz", "lines": 12574, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/networkengineering.stackexchange.com.jsonl.gz", "lines": 12590, "weight": 2},
-   {"name": "stackexchange_Title_Answer/networkengineering.stackexchange.com.jsonl.gz", "lines": 12590, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/space.stackexchange.com.jsonl.gz", "lines": 12893, "weight": 2},
-   {"name": "stackexchange_Title_Answer/space.stackexchange.com.jsonl.gz", "lines": 12893, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/quant.stackexchange.com.jsonl.gz", "lines": 12933, "weight": 2},
-   {"name": "stackexchange_Title_Answer/quant.stackexchange.com.jsonl.gz", "lines": 12933, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/philosophy.stackexchange.com.jsonl.gz", "lines": 13114, "weight": 2},
-   {"name": "stackexchange_Title_Answer/philosophy.stackexchange.com.jsonl.gz", "lines": 13114, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/gardening.stackexchange.com.jsonl.gz", "lines": 13246, "weight": 2},
-   {"name": "stackexchange_Title_Answer/gardening.stackexchange.com.jsonl.gz", "lines": 13246, "weight": 2},
-   {"name": "stackexchange_title_body/hinduism.stackexchange.com.jsonl.gz", "lines": 13450, "weight": 2},
-   {"name": "stackexchange_title_body/networkengineering.stackexchange.com.jsonl.gz", "lines": 13454, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/german.stackexchange.com.jsonl.gz", "lines": 13733, "weight": 2},
-   {"name": "stackexchange_Title_Answer/german.stackexchange.com.jsonl.gz", "lines": 13733, "weight": 2},
-   {"name": "stackexchange_title_body/german.stackexchange.com.jsonl.gz", "lines": 13950, "weight": 2},
-   {"name": "stackexchange_title_body/philosophy.stackexchange.com.jsonl.gz", "lines": 14829, "weight": 2},
-   {"name": "stackexchange_title_body/gardening.stackexchange.com.jsonl.gz", "lines": 15136, "weight": 2},
-   {"name": "stackexchange_title_body/space.stackexchange.com.jsonl.gz", "lines": 15142, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/bicycles.stackexchange.com.jsonl.gz", "lines": 15708, "weight": 2},
-   {"name": "stackexchange_Title_Answer/bicycles.stackexchange.com.jsonl.gz", "lines": 15708, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/law.stackexchange.com.jsonl.gz", "lines": 16133, "weight": 2},
-   {"name": "stackexchange_Title_Answer/law.stackexchange.com.jsonl.gz", "lines": 16133, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/arduino.stackexchange.com.jsonl.gz", "lines": 16281, "weight": 2},
-   {"name": "stackexchange_Title_Answer/arduino.stackexchange.com.jsonl.gz", "lines": 16281, "weight": 2},
-   {"name": "stackexchange_title_body/bicycles.stackexchange.com.jsonl.gz", "lines": 16353, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/emacs.stackexchange.com.jsonl.gz", "lines": 16830, "weight": 2},
-   {"name": "stackexchange_Title_Answer/emacs.stackexchange.com.jsonl.gz", "lines": 16830, "weight": 2},
-   {"name": "stackexchange_title_body/quant.stackexchange.com.jsonl.gz", "lines": 17261, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/dsp.stackexchange.com.jsonl.gz", "lines": 17430, "weight": 2},
-   {"name": "stackexchange_Title_Answer/dsp.stackexchange.com.jsonl.gz", "lines": 17430, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/puzzling.stackexchange.com.jsonl.gz", "lines": 17448, "weight": 2},
-   {"name": "stackexchange_Title_Answer/puzzling.stackexchange.com.jsonl.gz", "lines": 17448, "weight": 2},
-   {"name": "stackexchange_title_body/puzzling.stackexchange.com.jsonl.gz", "lines": 17851, "weight": 2},
-   {"name": "stackexchange_title_body/law.stackexchange.com.jsonl.gz", "lines": 17941, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/movies.stackexchange.com.jsonl.gz", "lines": 18243, "weight": 2},
-   {"name": "stackexchange_Title_Answer/movies.stackexchange.com.jsonl.gz", "lines": 18243, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/mechanics.stackexchange.com.jsonl.gz", "lines": 18613, "weight": 2},
-   {"name": "stackexchange_Title_Answer/mechanics.stackexchange.com.jsonl.gz", "lines": 18613, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/aviation.stackexchange.com.jsonl.gz", "lines": 18755, "weight": 2},
-   {"name": "stackexchange_Title_Answer/aviation.stackexchange.com.jsonl.gz", "lines": 18755, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/biology.stackexchange.com.jsonl.gz", "lines": 19277, "weight": 2},
-   {"name": "stackexchange_Title_Answer/biology.stackexchange.com.jsonl.gz", "lines": 19277, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/crypto.stackexchange.com.jsonl.gz", "lines": 19404, "weight": 2},
-   {"name": "stackexchange_Title_Answer/crypto.stackexchange.com.jsonl.gz", "lines": 19404, "weight": 2},
-   {"name": "stackexchange_title_body/arduino.stackexchange.com.jsonl.gz", "lines": 19553, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/music.stackexchange.com.jsonl.gz", "lines": 19936, "weight": 2},
-   {"name": "stackexchange_Title_Answer/music.stackexchange.com.jsonl.gz", "lines": 19936, "weight": 2},
-   {"name": "stackexchange_title_body/aviation.stackexchange.com.jsonl.gz", "lines": 20139, "weight": 2},
-   {"name": "stackexchange_title_body/softwarerecs.stackexchange.com.jsonl.gz", "lines": 20142, "weight": 2},
-   {"name": "stackexchange_title_body/movies.stackexchange.com.jsonl.gz", "lines": 20181, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/datascience.stackexchange.com.jsonl.gz", "lines": 20503, "weight": 2},
-   {"name": "stackexchange_Title_Answer/datascience.stackexchange.com.jsonl.gz", "lines": 20503, "weight": 2},
-   {"name": "stackexchange_title_body/music.stackexchange.com.jsonl.gz", "lines": 20636, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/japanese.stackexchange.com.jsonl.gz", "lines": 20948, "weight": 2},
-   {"name": "stackexchange_Title_Answer/japanese.stackexchange.com.jsonl.gz", "lines": 20948, "weight": 2},
-   {"name": "stackexchange_title_body/emacs.stackexchange.com.jsonl.gz", "lines": 21055, "weight": 2},
-   {"name": "stackexchange_title_body/dsp.stackexchange.com.jsonl.gz", "lines": 21252, "weight": 2},
-   {"name": "stackexchange_title_body/japanese.stackexchange.com.jsonl.gz", "lines": 22056, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/bitcoin.stackexchange.com.jsonl.gz", "lines": 22474, "weight": 2},
-   {"name": "stackexchange_Title_Answer/bitcoin.stackexchange.com.jsonl.gz", "lines": 22474, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/cooking.stackexchange.com.jsonl.gz", "lines": 22641, "weight": 2},
-   {"name": "stackexchange_Title_Answer/cooking.stackexchange.com.jsonl.gz", "lines": 22641, "weight": 2},
-   {"name": "stackexchange_title_body/mechanics.stackexchange.com.jsonl.gz", "lines": 22868, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/photo.stackexchange.com.jsonl.gz", "lines": 23204, "weight": 2},
-   {"name": "stackexchange_Title_Answer/photo.stackexchange.com.jsonl.gz", "lines": 23204, "weight": 2},
-   {"name": "stackexchange_title_body/crypto.stackexchange.com.jsonl.gz", "lines": 23231, "weight": 2},
-   {"name": "stackexchange_title_body/cooking.stackexchange.com.jsonl.gz", "lines": 23705, "weight": 2},
-   {"name": "stackexchange_title_body/photo.stackexchange.com.jsonl.gz", "lines": 23753, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/workplace.stackexchange.com.jsonl.gz", "lines": 24012, "weight": 2},
-   {"name": "stackexchange_Title_Answer/workplace.stackexchange.com.jsonl.gz", "lines": 24012, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/meta.stackoverflow.com.jsonl.gz", "lines": 24044, "weight": 2},
-   {"name": "stackexchange_Title_Answer/meta.stackoverflow.com.jsonl.gz", "lines": 24044, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/raspberrypi.stackexchange.com.jsonl.gz", "lines": 24143, "weight": 2},
-   {"name": "stackexchange_Title_Answer/raspberrypi.stackexchange.com.jsonl.gz", "lines": 24143, "weight": 2},
-   {"name": "stackexchange_title_body/workplace.stackexchange.com.jsonl.gz", "lines": 24189, "weight": 2},
-   {"name": "stackexchange_title_body/biology.stackexchange.com.jsonl.gz", "lines": 24447, "weight": 3},
-   {"name": "stackexchange_TitleBody_Answer/webapps.stackexchange.com.jsonl.gz", "lines": 24867, "weight": 3},
-   {"name": "stackexchange_Title_Answer/webapps.stackexchange.com.jsonl.gz", "lines": 24867, "weight": 3},
-   {"name": "stackexchange_title_body/bitcoin.stackexchange.com.jsonl.gz", "lines": 25374, "weight": 3},
-   {"name": "stackexchange_TitleBody_Answer/judaism.stackexchange.com.jsonl.gz", "lines": 26085, "weight": 3},
-   {"name": "stackexchange_Title_Answer/judaism.stackexchange.com.jsonl.gz", "lines": 26085, "weight": 3},
-   {"name": "stackexchange_TitleBody_Answer/ethereum.stackexchange.com.jsonl.gz", "lines": 26124, "weight": 3},
-   {"name": "stackexchange_Title_Answer/ethereum.stackexchange.com.jsonl.gz",
624
- "lines": 26124,
625
- "weight": 3
626
- },
627
- {
628
- "name": "stackexchange_TitleBody_Answer/worldbuilding.stackexchange.com.jsonl.gz",
629
- "lines": 26210,
630
- "weight": 3
631
- },
632
- {
633
- "name": "stackexchange_Title_Answer/worldbuilding.stackexchange.com.jsonl.gz",
634
- "lines": 26210,
635
- "weight": 3
636
- },
637
- {
638
- "name": "stackexchange_title_body/worldbuilding.stackexchange.com.jsonl.gz",
639
- "lines": 26763,
640
- "weight": 3
641
- },
642
- {
643
- "name": "stackexchange_TitleBody_Answer/chemistry.stackexchange.com.jsonl.gz",
644
- "lines": 27061,
645
- "weight": 3
646
- },
647
- {
648
- "name": "stackexchange_Title_Answer/chemistry.stackexchange.com.jsonl.gz",
649
- "lines": 27061,
650
- "weight": 3
651
- },
652
- {
653
- "name": "stackexchange_title_body/datascience.stackexchange.com.jsonl.gz",
654
- "lines": 27397,
655
- "weight": 3
656
- },
657
- {
658
- "name": "stackexchange_TitleBody_Answer/graphicdesign.stackexchange.com.jsonl.gz",
659
- "lines": 28083,
660
- "weight": 3
661
- },
662
- {
663
- "name": "stackexchange_Title_Answer/graphicdesign.stackexchange.com.jsonl.gz",
664
- "lines": 28083,
665
- "weight": 3
666
- },
667
- {
668
- "name": "stackexchange_TitleBody_Answer/ux.stackexchange.com.jsonl.gz",
669
- "lines": 28901,
670
- "weight": 3
671
- },
672
- {
673
- "name": "stackexchange_Title_Answer/ux.stackexchange.com.jsonl.gz",
674
- "lines": 28901,
675
- "weight": 3
676
- },
677
- {
678
- "name": "stackexchange_title_body/ux.stackexchange.com.jsonl.gz",
679
- "lines": 29403,
680
- "weight": 3
681
- },
682
- {
683
- "name": "stackexchange_TitleBody_Answer/money.stackexchange.com.jsonl.gz",
684
- "lines": 29404,
685
- "weight": 3
686
- },
687
- {
688
- "name": "stackexchange_Title_Answer/money.stackexchange.com.jsonl.gz",
689
- "lines": 29404,
690
- "weight": 3
691
- },
692
- {
693
- "name": "stackexchange_title_body/webapps.stackexchange.com.jsonl.gz",
694
- "lines": 29697,
695
- "weight": 3
696
- },
697
- {
698
- "name": "stackexchange_TitleBody_Answer/cs.stackexchange.com.jsonl.gz",
699
- "lines": 30010,
700
- "weight": 3
701
- },
702
- {
703
- "name": "stackexchange_Title_Answer/cs.stackexchange.com.jsonl.gz",
704
- "lines": 30010,
705
- "weight": 3
706
- },
707
- {
708
- "name": "stackexchange_title_body/graphicdesign.stackexchange.com.jsonl.gz",
709
- "lines": 30233,
710
- "weight": 3
711
- },
712
- {
713
- "name": "stackexchange_TitleBody_Answer/webmasters.stackexchange.com.jsonl.gz",
714
- "lines": 30370,
715
- "weight": 3
716
- },
717
- {
718
- "name": "stackexchange_Title_Answer/webmasters.stackexchange.com.jsonl.gz",
719
- "lines": 30370,
720
- "weight": 3
721
- },
722
- {
723
- "name": "stackexchange_title_body/raspberrypi.stackexchange.com.jsonl.gz",
724
- "lines": 30625,
725
- "weight": 3
726
- },
727
- {
728
- "name": "stackexchange_title_body/money.stackexchange.com.jsonl.gz",
729
- "lines": 32021,
730
- "weight": 3
731
- },
732
- {
733
- "name": "stackexchange_title_body/judaism.stackexchange.com.jsonl.gz",
734
- "lines": 32028,
735
- "weight": 3
736
- },
737
- {
738
- "name": "stackexchange_TitleBody_Answer/academia.stackexchange.com.jsonl.gz",
739
- "lines": 32137,
740
- "weight": 3
741
- },
742
- {
743
- "name": "stackexchange_Title_Answer/academia.stackexchange.com.jsonl.gz",
744
- "lines": 32137,
745
- "weight": 3
746
- },
747
- {
748
- "name": "stackexchange_title_body/ethereum.stackexchange.com.jsonl.gz",
749
- "lines": 32760,
750
- "weight": 3
751
- },
752
- {
753
- "name": "stackexchange_title_body/academia.stackexchange.com.jsonl.gz",
754
- "lines": 34331,
755
- "weight": 3
756
- },
757
- {
758
- "name": "stackexchange_title_body/chemistry.stackexchange.com.jsonl.gz",
759
- "lines": 34506,
760
- "weight": 3
761
- },
762
- {
763
- "name": "stackexchange_title_body/webmasters.stackexchange.com.jsonl.gz",
764
- "lines": 34559,
765
- "weight": 3
766
- },
767
- {
768
- "name": "stackexchange_title_body/meta.stackoverflow.com.jsonl.gz",
769
- "lines": 36456,
770
- "weight": 3
771
- },
772
- {
773
- "name": "stackexchange_TitleBody_Answer/travel.stackexchange.com.jsonl.gz",
774
- "lines": 36533,
775
- "weight": 4
776
- },
777
- {
778
- "name": "stackexchange_Title_Answer/travel.stackexchange.com.jsonl.gz",
779
- "lines": 36533,
780
- "weight": 4
781
- },
782
- {
783
- "name": "stackexchange_TitleBody_Answer/android.stackexchange.com.jsonl.gz",
784
- "lines": 38077,
785
- "weight": 4
786
- },
787
- {
788
- "name": "stackexchange_Title_Answer/android.stackexchange.com.jsonl.gz",
789
- "lines": 38077,
790
- "weight": 4
791
- },
792
- {
793
- "name": "stackexchange_title_body/cs.stackexchange.com.jsonl.gz",
794
- "lines": 38314,
795
- "weight": 4
796
- },
797
- {
798
- "name": "stackexchange_TitleBody_Answer/gamedev.stackexchange.com.jsonl.gz",
799
- "lines": 40154,
800
- "weight": 4
801
- },
802
- {
803
- "name": "stackexchange_Title_Answer/gamedev.stackexchange.com.jsonl.gz",
804
- "lines": 40154,
805
- "weight": 4
806
- },
807
- {
808
- "name": "stackexchange_TitleBody_Answer/rpg.stackexchange.com.jsonl.gz",
809
- "lines": 40435,
810
- "weight": 4
811
- },
812
- {
813
- "name": "stackexchange_Title_Answer/rpg.stackexchange.com.jsonl.gz",
814
- "lines": 40435,
815
- "weight": 4
816
- },
817
- {
818
- "name": "stackexchange_title_body/travel.stackexchange.com.jsonl.gz",
819
- "lines": 41227,
820
- "weight": 4
821
- },
822
- {
823
- "name": "stackexchange_TitleBody_Answer/codereview.stackexchange.com.jsonl.gz",
824
- "lines": 41748,
825
- "weight": 4
826
- },
827
- {
828
- "name": "stackexchange_Title_Answer/codereview.stackexchange.com.jsonl.gz",
829
- "lines": 41748,
830
- "weight": 4
831
- },
832
- {
833
- "name": "stackexchange_title_body/rpg.stackexchange.com.jsonl.gz",
834
- "lines": 42303,
835
- "weight": 4
836
- },
837
- {
838
- "name": "stackexchange_title_body/codereview.stackexchange.com.jsonl.gz",
839
- "lines": 45765,
840
- "weight": 4
841
- },
842
- {
843
- "name": "stackexchange_title_body/gamedev.stackexchange.com.jsonl.gz",
844
- "lines": 46485,
845
- "weight": 4
846
- },
847
- {
848
- "name": "stackexchange_TitleBody_Answer/softwareengineering.stackexchange.com.jsonl.gz",
849
- "lines": 51326,
850
- "weight": 5
851
- },
852
- {
853
- "name": "stackexchange_Title_Answer/softwareengineering.stackexchange.com.jsonl.gz",
854
- "lines": 51326,
855
- "weight": 5
856
- },
857
- {
858
- "name": "stackexchange_TitleBody_Answer/security.stackexchange.com.jsonl.gz",
859
- "lines": 51355,
860
- "weight": 5
861
- },
862
- {
863
- "name": "stackexchange_Title_Answer/security.stackexchange.com.jsonl.gz",
864
- "lines": 51355,
865
- "weight": 5
866
- },
867
- {
868
- "name": "stackexchange_title_body/android.stackexchange.com.jsonl.gz",
869
- "lines": 51608,
870
- "weight": 5
871
- },
872
- {
873
- "name": "stackexchange_TitleBody_Answer/diy.stackexchange.com.jsonl.gz",
874
- "lines": 52896,
875
- "weight": 5
876
- },
877
- {
878
- "name": "stackexchange_Title_Answer/diy.stackexchange.com.jsonl.gz",
879
- "lines": 52896,
880
- "weight": 5
881
- },
882
- {
883
- "name": "stackexchange_title_body/softwareengineering.stackexchange.com.jsonl.gz",
884
- "lines": 53942,
885
- "weight": 5
886
- },
887
- {
888
- "name": "stackexchange_TitleBody_Answer/blender.stackexchange.com.jsonl.gz",
889
- "lines": 54153,
890
- "weight": 5
891
- },
892
- {
893
- "name": "stackexchange_Title_Answer/blender.stackexchange.com.jsonl.gz",
894
- "lines": 54153,
895
- "weight": 5
896
- },
897
- {
898
- "name": "stackexchange_TitleBody_Answer/scifi.stackexchange.com.jsonl.gz",
899
- "lines": 54805,
900
- "weight": 5
901
- },
902
- {
903
- "name": "stackexchange_Title_Answer/scifi.stackexchange.com.jsonl.gz",
904
- "lines": 54805,
905
- "weight": 5
906
- },
907
- {
908
- "name": "stackexchange_title_body/security.stackexchange.com.jsonl.gz",
909
- "lines": 58000,
910
- "weight": 5
911
- },
912
- {
913
- "name": "stackexchange_TitleBody_Answer/mathematica.stackexchange.com.jsonl.gz",
914
- "lines": 59895,
915
- "weight": 5
916
- },
917
- {
918
- "name": "stackexchange_Title_Answer/mathematica.stackexchange.com.jsonl.gz",
919
- "lines": 59895,
920
- "weight": 5
921
- },
922
- {
923
- "name": "stackexchange_title_body/diy.stackexchange.com.jsonl.gz",
924
- "lines": 60083,
925
- "weight": 5
926
- },
927
- {
928
- "name": "stackexchange_TitleBody_Answer/meta.stackexchange.com.jsonl.gz",
929
- "lines": 60744,
930
- "weight": 5
931
- },
932
- {
933
- "name": "stackexchange_Title_Answer/meta.stackexchange.com.jsonl.gz",
934
- "lines": 60744,
935
- "weight": 5
936
- },
937
- {
938
- "name": "stackexchange_title_body/scifi.stackexchange.com.jsonl.gz",
939
- "lines": 61528,
940
- "weight": 6
941
- },
942
- {
943
- "name": "stackexchange_TitleBody_Answer/drupal.stackexchange.com.jsonl.gz",
944
- "lines": 67817,
945
- "weight": 6
946
- },
947
- {
948
- "name": "stackexchange_Title_Answer/drupal.stackexchange.com.jsonl.gz",
949
- "lines": 67817,
950
- "weight": 6
951
- },
952
- {
953
- "name": "stackexchange_TitleBody_Answer/dba.stackexchange.com.jsonl.gz",
954
- "lines": 71449,
955
- "weight": 6
956
- },
957
- {
958
- "name": "stackexchange_Title_Answer/dba.stackexchange.com.jsonl.gz",
959
- "lines": 71449,
960
- "weight": 6
961
- },
962
- {
963
- "name": "stackexchange_title_body/mathematica.stackexchange.com.jsonl.gz",
964
- "lines": 73131,
965
- "weight": 7
966
- },
967
- {
968
- "name": "stackexchange_TitleBody_Answer/ell.stackexchange.com.jsonl.gz",
969
- "lines": 77892,
970
- "weight": 7
971
- },
972
- {
973
- "name": "stackexchange_Title_Answer/ell.stackexchange.com.jsonl.gz",
974
- "lines": 77892,
975
- "weight": 7
976
- },
977
- {
978
- "name": "stackexchange_TitleBody_Answer/magento.stackexchange.com.jsonl.gz",
979
- "lines": 79241,
980
- "weight": 7
981
- },
982
- {
983
- "name": "stackexchange_Title_Answer/magento.stackexchange.com.jsonl.gz",
984
- "lines": 79241,
985
- "weight": 7
986
- },
987
- {
988
- "name": "stackexchange_title_body/drupal.stackexchange.com.jsonl.gz",
989
- "lines": 79717,
990
- "weight": 7
991
- },
992
- {
993
- "name": "stackexchange_TitleBody_Answer/sharepoint.stackexchange.com.jsonl.gz",
994
- "lines": 80420,
995
- "weight": 7
996
- },
997
- {
998
- "name": "stackexchange_Title_Answer/sharepoint.stackexchange.com.jsonl.gz",
999
- "lines": 80420,
1000
- "weight": 7
1001
- },
1002
- {
1003
- "name": "stackexchange_title_body/blender.stackexchange.com.jsonl.gz",
1004
- "lines": 80766,
1005
- "weight": 7
1006
- },
1007
- {
1008
- "name": "stackexchange_title_body/dba.stackexchange.com.jsonl.gz",
1009
- "lines": 81871,
1010
- "weight": 7
1011
- },
1012
- {
1013
- "name": "stackexchange_TitleBody_Answer/gaming.stackexchange.com.jsonl.gz",
1014
- "lines": 82887,
1015
- "weight": 7
1016
- },
1017
- {
1018
- "name": "stackexchange_Title_Answer/gaming.stackexchange.com.jsonl.gz",
1019
- "lines": 82887,
1020
- "weight": 7
1021
- },
1022
- {
1023
- "name": "stackexchange_title_body/ell.stackexchange.com.jsonl.gz",
1024
- "lines": 83271,
1025
- "weight": 7
1026
- },
1027
- {
1028
- "name": "stackexchange_title_body/meta.stackexchange.com.jsonl.gz",
1029
- "lines": 83510,
1030
- "weight": 7
1031
- },
1032
- {
1033
- "name": "stackexchange_TitleBody_Answer/wordpress.stackexchange.com.jsonl.gz",
1034
- "lines": 83621,
1035
- "weight": 7
1036
- },
1037
- {
1038
- "name": "stackexchange_Title_Answer/wordpress.stackexchange.com.jsonl.gz",
1039
- "lines": 83621,
1040
- "weight": 7
1041
- },
1042
- {
1043
- "name": "stackexchange_TitleBody_Answer/mathoverflow.net.jsonl.gz",
1044
- "lines": 85289,
1045
- "weight": 8
1046
- },
1047
- {
1048
- "name": "stackexchange_Title_Answer/mathoverflow.net.jsonl.gz",
1049
- "lines": 85289,
1050
- "weight": 8
1051
- },
1052
- {
1053
- "name": "stackexchange_TitleBody_Answer/salesforce.stackexchange.com.jsonl.gz",
1054
- "lines": 87272,
1055
- "weight": 8
1056
- },
1057
- {
1058
- "name": "stackexchange_Title_Answer/salesforce.stackexchange.com.jsonl.gz",
1059
- "lines": 87272,
1060
- "weight": 8
1061
- },
1062
- {
1063
- "name": "stackexchange_title_body/gaming.stackexchange.com.jsonl.gz",
1064
- "lines": 88912,
1065
- "weight": 8
1066
- },
1067
- {
1068
- "name": "stackexchange_TitleBody_Answer/apple.stackexchange.com.jsonl.gz",
1069
- "lines": 92487,
1070
- "weight": 8
1071
- },
1072
- {
1073
- "name": "stackexchange_Title_Answer/apple.stackexchange.com.jsonl.gz",
1074
- "lines": 92487,
1075
- "weight": 8
1076
- },
1077
- {
1078
- "name": "stackexchange_title_body/sharepoint.stackexchange.com.jsonl.gz",
1079
- "lines": 94011,
1080
- "weight": 8
1081
- },
1082
- {
1083
- "name": "stackexchange_title_body/magento.stackexchange.com.jsonl.gz",
1084
- "lines": 99991,
1085
- "weight": 9
1086
- },
1087
- {
1088
- "name": "stackexchange_TitleBody_Answer/gis.stackexchange.com.jsonl.gz",
1089
- "lines": 100254,
1090
- "weight": 9
1091
- },
1092
- {
1093
- "name": "stackexchange_Title_Answer/gis.stackexchange.com.jsonl.gz",
1094
- "lines": 100254,
1095
- "weight": 9
1096
- },
1097
- {
1098
- "name": "stackexchange_title_body/wordpress.stackexchange.com.jsonl.gz",
1099
- "lines": 100474,
1100
- "weight": 9
1101
- },
1102
- {
1103
- "name": "stackexchange_TitleBody_Answer/english.stackexchange.com.jsonl.gz",
1104
- "lines": 100640,
1105
- "weight": 9
1106
- },
1107
- {
1108
- "name": "stackexchange_Title_Answer/english.stackexchange.com.jsonl.gz",
1109
- "lines": 100640,
1110
- "weight": 9
1111
- },
1112
- {
1113
- "name": "stackexchange_title_body/salesforce.stackexchange.com.jsonl.gz",
1114
- "lines": 105260,
1115
- "weight": 9
1116
- },
1117
- {
1118
- "name": "stackexchange_title_body/english.stackexchange.com.jsonl.gz",
1119
- "lines": 109522,
1120
- "weight": 10
1121
- },
1122
- {
1123
- "name": "stackexchange_title_body/apple.stackexchange.com.jsonl.gz",
1124
- "lines": 110622,
1125
- "weight": 10
1126
- },
1127
- {
1128
- "name": "stackexchange_TitleBody_Answer/stats.stackexchange.com.jsonl.gz",
1129
- "lines": 115679,
1130
- "weight": 10
1131
- },
1132
- {
1133
- "name": "stackexchange_Title_Answer/stats.stackexchange.com.jsonl.gz",
1134
- "lines": 115679,
1135
- "weight": 10
1136
- },
1137
- {
1138
- "name": "stackexchange_title_body/mathoverflow.net.jsonl.gz",
1139
- "lines": 120851,
1140
- "weight": 10
1141
- },
1142
- {
1143
- "name": "stackexchange_TitleBody_Answer/electronics.stackexchange.com.jsonl.gz",
1144
- "lines": 129494,
1145
- "weight": 11
1146
- },
1147
- {
1148
- "name": "stackexchange_Title_Answer/electronics.stackexchange.com.jsonl.gz",
1149
- "lines": 129494,
1150
- "weight": 11
1151
- },
1152
- {
1153
- "name": "stackexchange_title_body/gis.stackexchange.com.jsonl.gz",
1154
- "lines": 131000,
1155
- "weight": 11
1156
- },
1157
- {
1158
- "name": "stackexchange_TitleBody_Answer/physics.stackexchange.com.jsonl.gz",
1159
- "lines": 141230,
1160
- "weight": 12
1161
- },
1162
- {
1163
- "name": "stackexchange_Title_Answer/physics.stackexchange.com.jsonl.gz",
1164
- "lines": 141230,
1165
- "weight": 12
1166
- },
1167
- {
1168
- "name": "stackexchange_title_body/electronics.stackexchange.com.jsonl.gz",
1169
- "lines": 143582,
1170
- "weight": 12
1171
- },
1172
- {
1173
- "name": "stackexchange_TitleBody_Answer/unix.stackexchange.com.jsonl.gz",
1174
- "lines": 155414,
1175
- "weight": 13
1176
- },
1177
- {
1178
- "name": "stackexchange_Title_Answer/unix.stackexchange.com.jsonl.gz",
1179
- "lines": 155414,
1180
- "weight": 13
1181
- },
1182
- {
1183
- "name": "stackexchange_TitleBody_Answer/tex.stackexchange.com.jsonl.gz",
1184
- "lines": 171628,
1185
- "weight": 15
1186
- },
1187
- {
1188
- "name": "stackexchange_Title_Answer/tex.stackexchange.com.jsonl.gz",
1189
- "lines": 171628,
1190
- "weight": 15
1191
- },
1192
- {
1193
- "name": "stackexchange_title_body/physics.stackexchange.com.jsonl.gz",
1194
- "lines": 173307,
1195
- "weight": 15
1196
- },
1197
- {
1198
- "name": "stackexchange_title_body/stats.stackexchange.com.jsonl.gz",
1199
- "lines": 173466,
1200
- "weight": 15
1201
- },
1202
- {
1203
- "name": "stackexchange_title_body/unix.stackexchange.com.jsonl.gz",
1204
- "lines": 185997,
1205
- "weight": 16
1206
- },
1207
- {
1208
- "name": "stackexchange_title_body/tex.stackexchange.com.jsonl.gz",
1209
- "lines": 202954,
1210
- "weight": 17
1211
- },
1212
- {
1213
- "name": "TriviaQA_pairs.jsonl.gz",
1214
- "lines": 73346,
1215
- "weight": 19
1216
- },
1217
- {
1218
- "name": "stackexchange_TitleBody_Answer/serverfault.com.jsonl.gz",
1219
- "lines": 238507,
1220
- "weight": 20
1221
- },
1222
- {
1223
- "name": "stackexchange_Title_Answer/serverfault.com.jsonl.gz",
1224
- "lines": 238507,
1225
- "weight": 20
1226
- },
1227
- {
1228
- "name": "stackexchange_duplicate_questions_title-body_title-body.jsonl.gz",
1229
- "lines": 250460,
1230
- "weight": 21
1231
- },
1232
- {
1233
- "name": "stackexchange_duplicate_questions_body_body.jsonl.gz",
1234
- "lines": 250519,
1235
- "weight": 21
1236
- },
1237
- {
1238
- "name": "squad_pairs.jsonl.gz",
1239
- "lines": 87599,
1240
- "weight": 22
1241
- },
1242
- {
1243
- "name": "stackexchange_TitleBody_Answer/askubuntu.com.jsonl.gz",
1244
- "lines": 267135,
1245
- "weight": 22
1246
- },
1247
- {
1248
- "name": "stackexchange_Title_Answer/askubuntu.com.jsonl.gz",
1249
- "lines": 267135,
1250
- "weight": 22
1251
- },
1252
- {
1253
- "name": "stackexchange_title_body/serverfault.com.jsonl.gz",
1254
- "lines": 270904,
1255
- "weight": 23
1256
- },
1257
- {
1258
- "name": "NQ-train_pairs.jsonl.gz",
1259
- "lines": 100231,
1260
- "weight": 25
1261
- },
1262
- {
1263
- "name": "SimpleWiki.jsonl.gz",
1264
- "lines": 102225,
1265
- "weight": 26
1266
- },
1267
- {
1268
- "name": "quora_duplicates_triplets.jsonl.gz",
1269
- "lines": 103663,
1270
- "weight": 26
1271
- },
1272
- {
1273
- "name": "stackexchange_duplicate_questions_title_title.jsonl.gz",
1274
- "lines": 304525,
1275
- "weight": 26
1276
- },
1277
- {
1278
- "name": "altlex.jsonl.gz",
1279
- "lines": 112696,
1280
- "weight": 28
1281
- },
1282
- {
1283
- "name": "stackexchange_title_body/askubuntu.com.jsonl.gz",
1284
- "lines": 347925,
1285
- "weight": 29
1286
- },
1287
- {
1288
- "name": "stackexchange_TitleBody_Answer/superuser.com.jsonl.gz",
1289
- "lines": 352610,
1290
- "weight": 30
1291
- },
1292
- {
1293
- "name": "stackexchange_Title_Answer/superuser.com.jsonl.gz",
1294
- "lines": 352610,
1295
- "weight": 30
1296
- },
1297
- {
1298
- "name": "wikihow.jsonl.gz",
1299
- "lines": 128542,
1300
- "weight": 32
1301
- },
1302
- {
1303
- "name": "stackexchange_title_body/superuser.com.jsonl.gz",
1304
- "lines": 435463,
1305
- "weight": 36
1306
- },
1307
- {
1308
- "name": "stackexchange_title_body/small_stackexchanges.jsonl.gz",
1309
- "lines": 448146,
1310
- "weight": 37
1311
- },
1312
- {
1313
- "name": "stackexchange_TitleBody_Answer/small_stackexchanges.jsonl.gz",
1314
- "lines": 460256,
1315
- "weight": 38
1316
- },
1317
- {
1318
- "name": "stackexchange_Title_Answer/small_stackexchanges.jsonl.gz",
1319
- "lines": 460256,
1320
- "weight": 38
1321
- },
1322
- {
1323
- "name": "sentence-compression.jsonl.gz",
1324
- "lines": 180000,
1325
- "weight": 45
1326
- },
1327
- {
1328
- "name": "AllNLI.jsonl.gz",
1329
- "lines": 277230,
1330
- "weight": 69
1331
- },
1332
- {
1333
- "name": "eli5_question_answer.jsonl.gz",
1334
- "lines": 325475,
1335
- "weight": 81
1336
- },
1337
- {
1338
- "name": "reddit/reddit_2015.jsonl.gz",
1339
- "lines": 135108166,
1340
- "weight": 82
1341
- },
1342
- {
1343
- "name": "reddit/reddit_2016.jsonl.gz",
1344
- "lines": 159164386,
1345
- "weight": 82
1346
- },
1347
- {
1348
- "name": "reddit/reddit_2017.jsonl.gz",
1349
- "lines": 191485219,
1350
- "weight": 82
1351
- },
1352
- {
1353
- "name": "reddit/reddit_2018.jsonl.gz",
1354
- "lines": 240726659,
1355
- "weight": 82
1356
- },
1357
- {
1358
- "name": "stackexchange_TitleBody_Answer/math.stackexchange.com.jsonl.gz",
1359
- "lines": 1100953,
1360
- "weight": 83
1361
- },
1362
- {
1363
- "name": "stackexchange_Title_Answer/math.stackexchange.com.jsonl.gz",
1364
- "lines": 1100953,
1365
- "weight": 83
1366
- },
1367
- {
1368
- "name": "stackexchange_title_body/math.stackexchange.com.jsonl.gz",
1369
- "lines": 1338443,
1370
- "weight": 83
1371
- },
1372
- {
1373
- "name": "stackexchange_TitleBody_Answer/stackoverflow.com-Posts.jsonl.gz",
1374
- "lines": 15768211,
1375
- "weight": 83
1376
- },
1377
- {
1378
- "name": "stackexchange_Title_Answer/stackoverflow.com-Posts.jsonl.gz",
1379
- "lines": 15768211,
1380
- "weight": 83
1381
- },
1382
- {
1383
- "name": "stackexchange_title_body/stackoverflow.com-Posts.jsonl.gz",
1384
- "lines": 18562443,
1385
- "weight": 83
1386
- },
1387
- {
1388
- "name": "specter_train_triples.jsonl.gz",
1389
- "lines": 684100,
1390
- "weight": 84
1391
- },
1392
- {
1393
- "name": "S2ORC_title_abstract.jsonl.gz",
1394
- "lines": 41769185,
1395
- "weight": 123
1396
- },
1397
- {
1398
- "name": "S2ORC_citation_pairs.jsonl.gz",
1399
- "lines": 52603982,
1400
- "weight": 123
1401
- },
1402
- {
1403
- "name": "PAQ_pairs.jsonl.gz",
1404
- "lines": 64371441,
1405
- "weight": 123
1406
- },
1407
- {
1408
- "name": "WikiAnswers_pairs.jsonl.gz",
1409
- "lines": 77427422,
1410
- "weight": 123
1411
- },
1412
- {
1413
- "name": "S2ORC_citation_pairs_abstract.jsonl.gz",
1414
- "lines": 116288806,
1415
- "weight": 123
1416
- },
1417
- {
1418
- "name": "searchQA_question_top5_snippets_merged.jsonl.gz",
1419
- "lines": 582261,
1420
- "weight": 144
1421
- },
1422
- {
1423
- "name": "yahoo_answers_title_question.jsonl.gz",
1424
- "lines": 659896,
1425
- "weight": 163
1426
- },
1427
- {
1428
- "name": "yahoo_answers_question_answer.jsonl.gz",
1429
- "lines": 681164,
1430
- "weight": 169
1431
- },
1432
- {
1433
- "name": "yahoo_answers_title_answer.jsonl.gz",
1434
- "lines": 1198260,
1435
- "weight": 247
1436
- },
1437
- {
1438
- "name": "amazon-qa-train-pairs.jsonl.gz",
1439
- "lines": 2448839,
1440
- "weight": 247
1441
- },
1442
- {
1443
- "name": "gooaq_pairs.jsonl.gz",
1444
- "lines": 3012496,
1445
- "weight": 247
1446
- },
1447
- {
1448
- "name": "msmarco-query_passage_negative.jsonl.gz",
1449
- "lines": 9144553,
1450
- "weight": 247
1451
- }
1452
- ]
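
For orientation: each entry in the deleted config above pairs a dataset file with a sampling weight. The deleted train_script.py further down in this commit expands each weight into repeated dataset indices, so the probability that a file contributes to a training batch is proportional to its weight. A minimal sketch of that consumption logic (the config path here is illustrative):

```python
# Sketch of how data_config.json was consumed, mirroring the dataset_indices
# logic in the deleted train_script.py below. Not part of the remaining code.
import json
import random

with open("data_config.json") as f_in:  # illustrative path
    data_config = json.load(f_in)

dataset_indices = []
for idx, entry in enumerate(data_config):
    # A weight of 247 makes a dataset ~123x more likely to be drawn than weight 2.
    dataset_indices.extend([idx] * entry["weight"])

# Drawing a dataset for the next batch is then a uniform choice over the
# expanded index list:
chosen = data_config[random.choice(dataset_indices)]
print(chosen["name"])
```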
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
- size 90868376
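
This file, like the other binary deletions below, is a Git LFS pointer (spec v1): `oid` is the SHA-256 of the actual payload and `size` is its length in bytes, so this deletion removes roughly 90 MB of model weights from the working tree. A small, hypothetical helper (not part of this repo) that checks a local file against such a pointer:

```python
# Sketch: verify a local file against a Git LFS pointer (spec v1).
# The function name and local path are hypothetical, for illustration only.
import hashlib
import os

def matches_pointer(path: str, oid: str, size: int) -> bool:
    # Cheap check first: the pointer records the payload size in bytes.
    if os.path.getsize(path) != size:
        return False
    # The oid is the SHA-256 hex digest of the payload.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == oid

# Example with the values from the pointer above:
# matches_pointer("model.safetensors",
#                 "53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db",
#                 90868376)
```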
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452
- size 90405214
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O1.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1391c6fc20b5530250bc15cbe1f47578ffeca55ab0551d335cc668b6299a88ec
- size 90360328
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O2.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1de3905029190b398c7d300b530e320cf4b5e7d3dfb9af1429ebd73fd9a16faf
- size 90326566
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O3.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a44f671e364dddbac31f203f07b91be6b0a35e51936e5ebfab65b6d9538b83ff
- size 90326497
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O4.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1667d7f3ba669048b13a96ee3a44456d5e42c8f44588ae8b603430e16160c485
- size 45212349
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b941bf19f1f1283680f449fa6a7336bb5600bdcd5f84d10ddc5cd72218a0fd21
- size 23046789
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8b86cab4722e2aefab310cf96d4d5a9eb3b187f7d9670a082afc55c7fa0d392a
- size 90265744
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.xml DELETED
The diff for this file is too large to render. See raw diff
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c92ea4af3c6bc7b4a0f3b3d61b147c850f4dbdd7c9e7beee0c0c70dc12da289b
- size 22933664
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml DELETED
The diff for this file is too large to render. See raw diff
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c3a85f238711653950f6a79ece63eb0ea93d76f6a6284be04019c53733baf256
- size 90888945
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/train_script.py DELETED
@@ -1,344 +0,0 @@
- """
- Train script for a single file
-
- Need to set the TPU address first:
- export XRT_TPU_CONFIG="localservice;0;localhost:51011"
- """
-
- import torch.multiprocessing as mp
- import threading
- import time
- import random
- import sys
- import argparse
- import gzip
- import json
- import logging
- import tqdm
- import torch
- from torch import nn
- from torch.utils.data import DataLoader
- import torch
- import torch_xla
- import torch_xla.core
- import torch_xla.core.functions
- import torch_xla.core.xla_model as xm
- import torch_xla.distributed.xla_multiprocessing as xmp
- import torch_xla.distributed.parallel_loader as pl
- import os
- from shutil import copyfile
-
-
- from transformers import (
-     AdamW,
-     AutoModel,
-     AutoTokenizer,
-     get_linear_schedule_with_warmup,
-     set_seed,
- )
-
- class AutoModelForSentenceEmbedding(nn.Module):
-     def __init__(self, model_name, tokenizer, normalize=True):
-         super(AutoModelForSentenceEmbedding, self).__init__()
-
-         self.model = AutoModel.from_pretrained(model_name)
-         self.normalize = normalize
-         self.tokenizer = tokenizer
-
-     def forward(self, **kwargs):
-         model_output = self.model(**kwargs)
-         embeddings = self.mean_pooling(model_output, kwargs['attention_mask'])
-         if self.normalize:
-             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-
-         return embeddings
-
-     def mean_pooling(self, model_output, attention_mask):
-         token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-     def save_pretrained(self, output_path):
-         if xm.is_master_ordinal():
-             self.tokenizer.save_pretrained(output_path)
-             self.model.config.save_pretrained(output_path)
-
-         xm.save(self.model.state_dict(), os.path.join(output_path, "pytorch_model.bin"))
-
-
-
-
- def train_function(index, args, queue):
-     tokenizer = AutoTokenizer.from_pretrained(args.model)
-     model = AutoModelForSentenceEmbedding(args.model, tokenizer)
-
-
-     ### Train Loop
-     device = xm.xla_device()
-     model = model.to(device)
-
-     # Instantiate optimizer
-     optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=True)
-
-     lr_scheduler = get_linear_schedule_with_warmup(
-         optimizer=optimizer,
-         num_warmup_steps=500,
-         num_training_steps=args.steps,
-     )
-
-     # Now we train the model
-     cross_entropy_loss = nn.CrossEntropyLoss()
-     max_grad_norm = 1
-
-     model.train()
-
-     for global_step in tqdm.trange(args.steps, disable=not xm.is_master_ordinal()):
-         #### Get the batch data
-         batch = queue.get()
-         #print(index, "batch {}x{}".format(len(batch), ",".join([str(len(b)) for b in batch])))
-
-
-         if len(batch[0]) == 2:  #(anchor, positive)
-             text1 = tokenizer([b[0] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-             text2 = tokenizer([b[1] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-
-             ### Compute embeddings
-             embeddings_a = model(**text1.to(device))
-             embeddings_b = model(**text2.to(device))
-
-             ### Gather all embedings
-             embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
-             embeddings_b = torch_xla.core.functions.all_gather(embeddings_b)
-
-             ### Compute similarity scores 512 x 512
-             scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale
-
-             ### Compute cross-entropy loss
-             labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device)  # Example a[i] should match with b[i]
-
-             ## Symmetric loss as in CLIP
-             loss = (cross_entropy_loss(scores, labels) + cross_entropy_loss(scores.transpose(0, 1), labels)) / 2
-
-         else:  #(anchor, positive, negative)
-             text1 = tokenizer([b[0] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-             text2 = tokenizer([b[1] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-             text3 = tokenizer([b[2] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-
-             embeddings_a = model(**text1.to(device))
-             embeddings_b1 = model(**text2.to(device))
-             embeddings_b2 = model(**text3.to(device))
-
-             embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
-             embeddings_b1 = torch_xla.core.functions.all_gather(embeddings_b1)
-             embeddings_b2 = torch_xla.core.functions.all_gather(embeddings_b2)
-
-             embeddings_b = torch.cat([embeddings_b1, embeddings_b2])
-
-             ### Compute similarity scores 512 x 1024
-             scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale
-
-             ### Compute cross-entropy loss
-             labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device)  # Example a[i] should match with b[i]
-
-             ## One-way loss
-             loss = cross_entropy_loss(scores, labels)
-
-
-         # Backward pass
-         optimizer.zero_grad()
-         loss.backward()
-         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
-
-         xm.optimizer_step(optimizer, barrier=True)
-         lr_scheduler.step()
-
-
-         #Save model
-         if (global_step+1) % args.save_steps == 0:
-             output_path = os.path.join(args.output, str(global_step+1))
-             xm.master_print("save model: "+output_path)
-             model.save_pretrained(output_path)
-
-
-     output_path = os.path.join(args.output, "final")
-     xm.master_print("save model final: "+ output_path)
-     model.save_pretrained(output_path)
-
-
- def produce_data(args, queue, filepaths, dataset_indices):
-     global_batch_size = args.batch_size*args.nprocs  #Global batch size
-     size_per_dataset = int(global_batch_size / args.datasets_per_batch)  #How many datasets per batch
-     num_same_dataset = int(size_per_dataset / args.batch_size)
-     print("producer", "global_batch_size", global_batch_size)
-     print("producer", "size_per_dataset", size_per_dataset)
-     print("producer", "num_same_dataset", num_same_dataset)
-
-     datasets = []
-     for filepath in filepaths:
-         if "reddit_" in filepath:  #Special dataset class for Reddit files
-             data_obj = RedditDataset(filepath)
-         else:
-             data_obj = Dataset(filepath)
-         datasets.append(iter(data_obj))
-
-     # Store if dataset is in a 2 col or 3 col format
-     num_cols = {idx: len(next(dataset)) for idx, dataset in enumerate(datasets)}
-
-     while True:
-         texts_in_batch = set()
-         batch_format = None  #2 vs 3 col format for this batch
-
-         #Add data from several sub datasets
-         for _ in range(args.datasets_per_batch):
-             valid_dataset = False  #Check that datasets have the same 2/3 col format
-             while not valid_dataset:
-                 data_idx = random.choice(dataset_indices)
-                 if batch_format is None:
-                     batch_format = num_cols[data_idx]
-                     valid_dataset = True
-                 else:  #Check that this dataset has the same format
-                     valid_dataset = (batch_format == num_cols[data_idx])
-
-             #Get data from this dataset
-             dataset = datasets[data_idx]
-             for _ in range(num_same_dataset):
-                 for _ in range(args.nprocs):
-                     batch_device = []  #A batch for one device
-                     while len(batch_device) < args.batch_size:
-                         sample = next(dataset)
-                         in_batch = False
-                         for text in sample:
-                             if text in texts_in_batch:
-                                 in_batch = True
-                                 break
-
-                         if not in_batch:
-                             for text in sample:
-                                 texts_in_batch.add(text)
-                             batch_device.append(sample)
-
-                     queue.put(batch_device)
-
-
- class RedditDataset:
-     """
-     A class that handles the reddit data files
-     """
-     def __init__(self, filepath):
-         self.filepath = filepath
-
-     def __iter__(self):
-         while True:
-             with gzip.open(self.filepath, "rt") as fIn:
-                 for line in fIn:
-                     data = json.loads(line)
-
-                     if "response" in data and "context" in data:
-                         yield [data["response"], data["context"]]
-
- class Dataset:
-     """
-     A class that handles one dataset
-     """
-     def __init__(self, filepath):
-         self.filepath = filepath
-
-     def __iter__(self):
-         max_dataset_size = 10*1000*1000  #Cache small datasets in memory
-         dataset = []
-         data_format = None
-
-         while dataset is None or len(dataset) == 0:
-             with gzip.open(self.filepath, "rt") as fIn:
-                 for line in fIn:
-                     data = json.loads(line)
-                     if isinstance(data, dict):
-                         data = data['texts']
-
-                     if data_format is None:
-                         data_format = len(data)
-
-                     #Ensure that all entries are of the same 2/3 col format
-                     assert len(data) == data_format
-
-                     if dataset is not None:
-                         dataset.append(data)
-                         if len(dataset) >= max_dataset_size:
-                             dataset = None
-
-                     yield data
-
-         # Data loaded. Now stream to the queue
-         # Shuffle for each epoch
-         while True:
-             random.shuffle(dataset)
-             for data in dataset:
-                 yield data
-
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
-     parser.add_argument('--steps', type=int, default=2000)
-     parser.add_argument('--save_steps', type=int, default=10000)
-     parser.add_argument('--batch_size', type=int, default=64)
-     parser.add_argument('--max_length', type=int, default=128)
-     parser.add_argument('--nprocs', type=int, default=8)
-     parser.add_argument('--datasets_per_batch', type=int, default=2, help="Number of datasets per batch")
-     parser.add_argument('--scale', type=float, default=20, help="Use 20 for cossim, and 1 when you work with unnormalized embeddings with dot product")
-     parser.add_argument('--data_folder', default="/data", help="Folder with your dataset files")
-     parser.add_argument('data_config', help="A data_config.json file")
-     parser.add_argument('output')
-     args = parser.parse_args()
-
-     # Ensure global batch size is divisble by data_sample_size
-     assert (args.batch_size*args.nprocs) % args.datasets_per_batch == 0
-
-     logging.info("Output: "+args.output)
-     if os.path.exists(args.output):
-         print("Output folder already exists.")
-         input("Continue?")
-
-     # Write train script to output path
-     os.makedirs(args.output, exist_ok=True)
-
-     data_config_path = os.path.join(args.output, 'data_config.json')
-     copyfile(args.data_config, data_config_path)
-
-     train_script_path = os.path.join(args.output, 'train_script.py')
-     copyfile(__file__, train_script_path)
-     with open(train_script_path, 'a') as fOut:
-         fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
-
-
-
-     #Load data config
-     with open(args.data_config) as fIn:
-         data_config = json.load(fIn)
-
-     queue = mp.Queue(maxsize=100*args.nprocs)
-
-     filepaths = []
-     dataset_indices = []
-     for idx, data in enumerate(data_config):
-         filepaths.append(os.path.join(os.path.expanduser(args.data_folder), data['name']))
-         dataset_indices.extend([idx]*data['weight'])
-
-     # Start producer
-     p = mp.Process(target=produce_data, args=(args, queue, filepaths, dataset_indices))
-     p.start()
-
-     # Run training
-     print("Start processes:", args.nprocs)
-     xmp.spawn(train_function, args=(args, queue), nprocs=args.nprocs, start_method='fork')
-     print("Training done")
-     print("It might be that not all processes exit automatically. In that case you must manually kill this process.")
-     print("With 'pkill python' you can kill all remaining python processes")
-     p.kill()
-     exit()
-
-
-
- # Script was called via:
- #python train_many_data_files_v2.py --steps 1000000 --batch_size 128 --model nreimers/MiniLM-L6-H384-uncased train_data_configs/all_datasets_v4.json output/all_datasets_v4_MiniLM-L6-H384-uncased-batch128
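
For reference, the objective removed with this script is contrastive training with in-batch negatives: the scaled similarity matrix between gathered anchor and positive embeddings is fed to a symmetric cross-entropy, as in CLIP. A standalone sketch of that loss under the script's own defaults (tensor dimensions are illustrative):

```python
# Minimal sketch of the symmetric in-batch-negatives loss from the deleted
# train_script.py above. Row i of the score matrix should prefer column i.
import torch
from torch import nn

def symmetric_info_nce(embeddings_a, embeddings_b, scale=20.0):
    # Rows: anchors, columns: positives of the whole (gathered) batch.
    scores = embeddings_a @ embeddings_b.T * scale
    labels = torch.arange(scores.size(0), device=scores.device)
    ce = nn.CrossEntropyLoss()
    # Same symmetric (CLIP-style) formulation as the script.
    return (ce(scores, labels) + ce(scores.T, labels)) / 2

# With L2-normalized embeddings, scale=20 matches the script's default for
# cosine similarity; the script's --scale help text suggests 1 for
# unnormalized dot-product embeddings.
a = torch.nn.functional.normalize(torch.randn(8, 384), dim=1)
b = torch.nn.functional.normalize(torch.randn(8, 384), dim=1)
print(symmetric_info_nce(a, b).item())
```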
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
- size 90868376
 
 
 
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7dfc82496ec33f906b5b0d6750c1e2397da6530c74d1ae3568c55bc2739125e7
- size 10454
 
 
 
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/cb202bfe2e3c98645018a6d12f182a434c9d3e02 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:be50c3628f2bf5bb5e3a7f17b1f74611b2561a3a27eeab05e5aa30f411572037
- size 466247
 
 
 
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
- size 231508
 
 
 
 
models/sentence-transformers_all-MiniLM-L6-v2/README.md DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7dfc82496ec33f906b5b0d6750c1e2397da6530c74d1ae3568c55bc2739125e7
- size 10454
 
 
 
 
models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053