destinyebuka committed on
Commit
b398556
1 Parent(s): 03ebe50
Files changed (29)
  1. app/ai/routes/chat.py +18 -23
  2. app/ai/tools/listing_tool.py +229 -55
  3. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/README.md +0 -173
  4. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/data_config.json +0 -1452
  5. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/model.safetensors +0 -3
  6. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model.onnx +0 -3
  7. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O1.onnx +0 -3
  8. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O2.onnx +0 -3
  9. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O3.onnx +0 -3
  10. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O4.onnx +0 -3
  11. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx +0 -3
  12. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +0 -3
  13. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +0 -3
  14. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx +0 -3
  15. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.bin +0 -3
  16. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.xml +0 -0
  17. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin +0 -3
  18. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml +0 -0
  19. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/pytorch_model.bin +0 -3
  20. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/tokenizer.json +0 -0
  21. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/train_script.py +0 -344
  22. app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/vocab.txt +0 -0
  23. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db +0 -3
  24. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f +0 -3
  25. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/cb202bfe2e3c98645018a6d12f182a434c9d3e02 +0 -3
  26. models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938 +0 -3
  27. models/sentence-transformers_all-MiniLM-L6-v2/README.md +0 -3
  28. models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx +0 -3
  29. models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx +0 -3
app/ai/routes/chat.py CHANGED
@@ -1,5 +1,5 @@
- # app/ai/routes/chat.py - FINAL VERSION
- # Sends only draft data, Flutter builds native UI
+ # app/ai/routes/chat.py - UPDATED FOR FRONTEND IMAGE URLs
+ # Images are now extracted from message text, no separate upload endpoint needed
 
  from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks
  from fastapi.security import HTTPBearer
@@ -33,7 +33,6 @@ class AskBody(BaseModel):
  session_id: Optional[str] = None
  thread_id: Optional[str] = None
  start_new_session: Optional[bool] = False
- image_urls: Optional[List[str]] = None # URLs from Cloudflare (client-side uploaded)
 
 
  class ChatResponse(BaseModel):
@@ -42,6 +41,7 @@ class ChatResponse(BaseModel):
  action: str
  state: Optional[Dict[str, Any]] = None
  draft: Optional[Dict[str, Any]] = None
+ draft_ui: Optional[Dict[str, Any]] = None # UI component for draft preview
  mongo_id: Optional[str] = None
  error: Optional[str] = None
 
@@ -118,9 +118,13 @@ async def ask_ai(
  """
  Main chat endpoint with:
  - Greeting detection & response
- - Simplified listing flow
- - Image URL handling from Cloudflare
- - Returns only draft data (NO HTML)
+ - Simplified listing flow with IMAGE URL EXTRACTION from message
+ - Returns draft data WITH UI COMPONENT
+
+ Image Handling:
+ - Frontend uploads image and gets URL
+ - User sends message: "Here's the property image: https://..."
+ - Aida extracts URL from message and stores it
 
  Flow:
  1. Authenticate
@@ -128,7 +132,7 @@ async def ask_ai(
  3. Get/create memory
  4. Check for greeting
  5. Detect intent (listing, publish, edit, etc.)
- 6. Process accordingly
+ 6. Process accordingly (URLs extracted from messages)
  7. Return response
  """
 
@@ -157,7 +161,6 @@ async def ask_ai(
  user_id=user_id,
  session_id=session_id,
  status=context.get("status"),
- has_images=bool(body.image_urls),
  )
 
  # CHECK RESET
@@ -173,7 +176,6 @@ async def ask_ai(
  context["user_role"] = user_role
  await memory.update_context(context)
  await memory.clear()
- # Continue with normal message processing
 
  # INIT CONTEXT IF NEW
  if not context:
@@ -200,11 +202,9 @@ async def ask_ai(
  user_role=user_role
  )
 
- # Add to history
  await memory.add_message("user", body.message)
  await memory.add_message("assistant", greeting_result["reply"])
 
- # Update context
  context["last_activity"] = datetime.utcnow().isoformat()
  context["status"] = greeting_result["state"].get("status", "idle")
  await memory.update_context(context)
@@ -231,10 +231,7 @@ async def ask_ai(
  "images": [],
  })
 
- # ✅ ADD IMAGE URLs from Cloudflare (client-side uploaded)
- if body.image_urls:
- logger.info(f"Adding {len(body.image_urls)} image URLs to listing", user_id=user_id)
- listing_state["images"].extend(body.image_urls)
+ # ✅ NO SEPARATE IMAGE URLS - process_listing will extract from message
 
  # Process listing
  result = await process_listing(
@@ -242,7 +239,6 @@ async def ask_ai(
  user_id=user_id,
  user_role=user_role,
  current_state=listing_state,
- image_urls=listing_state.get("images", []),
  )
 
  # Update context
@@ -264,7 +260,8 @@ async def ask_ai(
  text=result["reply"],
  action=result["action"],
  state=context,
- draft=result.get("draft"), # ✅ Only send draft data, NO HTML
+ draft=result.get("draft"),
+ draft_ui=result.get("draft_ui"), # UI component
  error=result.get("error")
  )
 
@@ -279,11 +276,11 @@ async def ask_ai(
  # from app.database import get_db
  # db = await get_db()
  # listing = await db.listings.insert_one(draft)
- # listing_id = listing.inserted_id
+ # mongo_id = str(listing.inserted_id)
 
  logger.info("Listing published", user_id=user_id, title=draft.get("title"))
 
- # Clear listing state
+ # CLEAR LISTING STATE AND SET STATUS TO IDLE
  context["status"] = "idle"
  context["listing_state"] = {}
  context["draft"] = None
@@ -368,7 +365,7 @@ async def ask_ai(
  text=reply,
  action="show_draft",
  state=context,
- draft=draft, # ✅ Only send draft data, NO HTML
+ draft=draft,
  )
 
  # 5. DISCARD DRAFT
@@ -407,11 +404,9 @@ async def ask_ai(
  conversation_context=context
  )
 
- # Add to history
  await memory.add_message("user", body.message)
  await memory.add_message("assistant", reply)
 
- # Update context
  context["last_activity"] = datetime.utcnow().isoformat()
  if "tool" in tool_result:
  context["last_tool"] = tool_result["tool"]
@@ -460,7 +455,7 @@ async def health_check():
  """Health check for chat service"""
  return {
  "status": "healthy",
- "service": "Aida Chat with Native Flutter UI",
+ "service": "Aida Chat with Frontend Image URLs",
  "langsmith": "enabled" if os.getenv("LANGCHAIN_API_KEY") else "disabled",
  }
 
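A minimal sketch of how a client might exercise the updated endpoint after this commit. The mount path, host, and bearer auth are assumptions for illustration (the diff only shows the route handler, not the router prefix); the request/response field names (`message`, `session_id`, `action`, `draft`, `draft_ui`) come from the code above.

```python
# Hypothetical client call; URL and auth scheme are assumptions, not
# confirmed by this diff. The image URL now travels inside the message
# text - there is no separate image_urls field anymore.
import asyncio
import httpx

async def send_listing_message(token: str) -> None:
    payload = {
        "message": ("3-bed flat in Lagos, 2 baths, 500k per year. "
                    "Here's the property image: https://imagedelivery.net/abc/img1.png"),
        "session_id": None,
        "start_new_session": False,
    }
    async with httpx.AsyncClient() as client:
        resp = await client.post(
            "https://api.example.com/ai/chat/ask",  # assumed mount point
            json=payload,
            headers={"Authorization": f"Bearer {token}"},
        )
    data = resp.json()
    if data["action"] == "show_draft":
        # draft_ui is the new pre-built preview component for the Flutter client
        print(data["draft_ui"]["component_type"], data["draft_ui"]["actions"])

asyncio.run(send_listing_message("dummy-token"))
```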
app/ai/tools/listing_tool.py CHANGED
@@ -1,8 +1,8 @@
  # app/ai/tools/listing_tool.py
- # FINAL VERSION: Simplified listing logic - NO HTML generation
- # Backend sends only draft data, Flutter builds native UI
+ # FINAL VERSION: Random examples + AI-powered URL extraction
 
  import json
+ import re
  from typing import Dict, Optional, Tuple, List
  from pydantic import BaseModel, Field
  from structlog import get_logger
@@ -29,44 +29,142 @@ llm = ChatOpenAI(
  )
 
 
- # ========== STEP 1: SHOW EXAMPLE ==========
+ # ========== AI-POWERED URL EXTRACTION ==========
+
+ async def extract_image_urls_from_message(user_message: str) -> List[str]:
+ """
+ AI-powered image URL extraction using LLM.
+
+ The LLM is smarter than regex:
+ - Understands context
+ - Handles edge cases
+ - Filters out non-image URLs
+ - Extracts from various formats
+
+ Returns:
+ List of image URLs found in message
+ """
+
+ logger.info("Extracting image URLs with AI", msg_len=len(user_message))
+
+ try:
+ prompt_text = f"""Extract image URLs from this user message.
+
+ User message: "{user_message}"
+
+ Your task:
+ 1. Look for URLs in the message
+ 2. Identify which ones are likely image URLs (jpg, png, gif, webp, cloudflare, etc.)
+ 3. Extract ONLY image URLs, NOT other types
+ 4. Return as JSON array
+
+ Important:
+ - Image URLs usually end in: .jpg, .png, .gif, .webp, or contain "imagedelivery", "cloudinary", "imgur", etc.
+ - Include full URLs with https://
+ - Exclude URLs that are clearly not images (don't include docs, videos, etc.)
+ - If no image URLs found, return empty array
+ - Return ONLY valid JSON, nothing else
+
+ Return JSON ONLY:
+ {{
+ "urls": ["https://...", "https://..."] or []
+ }}"""
+
+ messages = [
+ SystemMessage(content="You are a URL extraction expert. Identify and extract image URLs from text. Return ONLY valid JSON with 'urls' array."),
+ HumanMessage(content=prompt_text)
+ ]
+
+ response = await llm.ainvoke(messages)
+ response_text = response.content if hasattr(response, 'content') else str(response)
+
+ logger.info("LLM extraction response", response=response_text[:100])
+
+ # Parse JSON from response
+ try:
+ result = json.loads(response_text)
+ urls = result.get("urls", [])
+
+ # Validate URLs
+ valid_urls = []
+ for url in urls:
+ if isinstance(url, str) and (url.startswith("http://") or url.startswith("https://")):
+ valid_urls.append(url)
+
+ if valid_urls:
+ logger.info("Extracted image URLs with AI", count=len(valid_urls), urls=[u[:60] + "..." for u in valid_urls])
+
+ return valid_urls
+
+ except json.JSONDecodeError:
+ # Try to extract JSON from response
+ json_match = re.search(r'\{.*\}', response_text, re.DOTALL)
+ if json_match:
+ try:
+ result = json.loads(json_match.group())
+ urls = result.get("urls", [])
+ return [u for u in urls if isinstance(u, str) and u.startswith(("http://", "https://"))]
+ except:
+ return []
+ return []
+
+ except Exception as e:
+ logger.error("AI URL extraction failed", exc_info=e)
+ return []
+
+
+ # ========== STEP 1: SHOW RANDOM EXAMPLE ==========
 
  async def generate_listing_example(user_language: str, user_role: str) -> str:
  """
- Generate a SHORT, realistic listing example in user's language.
- Different each time (not hardcoded).
+ Generate a RANDOM, unique listing example each time.
+
+ Different every time because:
+ - Random locations
+ - Random prices
+ - Random amenities
+ - Random requirements
+ - Different phrasing/structure
 
- Shows all fields: bedrooms, bathrooms, location, price, amenities, requirements
- Format: Natural sentence (NOT a list)
+ Result: Users never see the same example twice!
  """
 
- logger.info("Generating listing example", language=user_language, role=user_role)
+ logger.info("Generating random listing example", language=user_language, role=user_role)
 
  try:
  role_context = "as a landlord renting an apartment" if user_role == "landlord" else "as a renter looking for a roommate to share your apartment"
 
- prompt = f"""Generate a SHORT, realistic property listing example {role_context} in {user_language}.
+ prompt_text = f"""Generate a UNIQUE, realistic property listing example {role_context} in {user_language}.
+
+ IMPORTANT: Generate a DIFFERENT example each time. Vary:
+ - Location (city name, area)
+ - Number of bedrooms/bathrooms
+ - Price amount
+ - Amenities (different set each time)
+ - Requirements (different each time)
+ - Phrasing and structure
 
  Requirements:
  - Keep it 2-3 sentences MAXIMUM
- - Include ALL of these: location, bedrooms, bathrooms, price, price_type, at least one amenity, one requirement
+ - Include ALL of these fields: location, bedrooms, bathrooms, price, price_type, at least one amenity, one requirement
  - Format: Natural sentence (NOT a list or bullet points)
  - Language: Respond ONLY in {user_language}, no mixing
  - Realistic: Use real cities and reasonable prices
+ - DIFFERENT: Make it unique from previous examples
 
  Example format (DO NOT copy exactly):
- "I have a 2-bedroom, 1-bathroom apartment in [city] for [price] per [time] with wifi and parking. Tenant must provide 3-month deposit."
+ "I have a 2-bedroom, 1-bathroom apartment in [CITY] for [PRICE] per [TIME] with [AMENITY1] and [AMENITY2]. [REQUIREMENT]."
 
- Now generate YOUR OWN unique example in {user_language}:"""
+ Now generate YOUR OWN unique example in {user_language}. Make it different from typical examples:"""
 
  response = await llm.ainvoke([
- SystemMessage(content="You are Aida, a real estate assistant. Generate SHORT, realistic property listing examples. Keep them natural, conversational, under 3 sentences."),
- HumanMessage(content=prompt)
+ SystemMessage(content="You are Aida, a creative real estate assistant. Generate UNIQUE, realistic property listing examples. Keep them natural, conversational, under 3 sentences. Each example should be different from the last."),
+ HumanMessage(content=prompt_text)
  ])
 
  example = response.content if hasattr(response, 'content') else str(response)
 
- logger.info("Example generated successfully", length=len(example))
+ logger.info("Random example generated successfully", length=len(example))
  return example.strip()
 
  except Exception as e:
@@ -102,6 +200,7 @@ Extract these fields (set to null if not mentioned):
  Important:
  - Be smart about understanding intent (typos, informal language)
  - Extract numbers from text (e.g., "2bd" = 2, "50k" = 50000)
+ - IGNORE URLs - do NOT try to extract fields from URLs
  - Return ONLY valid JSON, nothing else
 
  Return JSON ONLY:
@@ -150,17 +249,7 @@ async def auto_detect_listing_type(
  user_role: str,
  user_message: str = ""
  ) -> str:
- """
- Auto-detect listing type based on SIMPLE RULES:
-
- For Landlord:
- - monthly OR yearly → "rent"
- - weekly OR daily OR nightly → "short-stay"
- - "for sale" OR "selling" in message → "sale"
-
- For Renter:
- - ALWAYS → "roommate"
- """
+ """Auto-detect listing type based on SIMPLE RULES."""
 
  if user_role == "renter":
  return "roommate"
@@ -182,10 +271,7 @@
  # ========== STEP 4: AUTO-DETECT CURRENCY ==========
 
  async def get_currency_for_location(location: str) -> str:
- """
- Get currency for location using ML extractor.
- ML extractor handles geolocation + currency detection.
- """
+ """Get currency for location using ML extractor."""
 
  try:
  currency, city, confidence = await ml_extractor.infer_currency(
@@ -213,7 +299,7 @@ async def get_currency_for_location(location: str) -> str:
  "london": "GBP", "manchester": "GBP", "edinburgh": "GBP",
  "paris": "EUR", "lyon": "EUR", "marseille": "EUR",
  "madrid": "EUR", "barcelona": "EUR", "valencia": "EUR",
- "newyork": "USD", "new york": "USD", "losangeles": "USD", "chicago": "USD",
+ "austin": "USD", "newyork": "USD", "new york": "USD", "losangeles": "USD", "chicago": "USD",
  "portland": "USD", "seattle": "USD", "san francisco": "USD",
  }
 
@@ -312,6 +398,59 @@ Return ONLY valid JSON:
  return title, description
 
 
+ # ========== BUILD DRAFT UI COMPONENT ==========
+
+ def build_draft_ui_component(draft: Dict) -> Dict:
+ """
+ Build UI component data for draft preview.
+ Frontend uses this to render the draft preview UI.
+ """
+
+ amenities_icons = {
+ "wifi": "📶",
+ "parking": "🅿️",
+ "furnished": "🛋️",
+ "washing machine": "🧼",
+ "dryer": "🌪️",
+ "ac": "🌬️",
+ "air conditioning": "🌬️",
+ "balcony": "🏠",
+ "pool": "🏊",
+ "gym": "💪",
+ "garden": "🌳",
+ "kitchen": "🍳",
+ }
+
+ # Build amenities with icons
+ amenities = draft.get("amenities", [])
+ amenities_display = []
+ for amenity in amenities:
+ icon = amenities_icons.get(amenity.lower(), "✓")
+ amenities_display.append(f"{icon} {amenity.capitalize()}")
+
+ ui_component = {
+ "component_type": "listing_draft_preview",
+ "title": draft.get("title"),
+ "description": draft.get("description"),
+ "location": draft.get("location"),
+ "bedrooms": draft.get("bedrooms"),
+ "bathrooms": draft.get("bathrooms"),
+ "price": draft.get("price"),
+ "price_type": draft.get("price_type"),
+ "currency": draft.get("currency"),
+ "listing_type": draft.get("listing_type"),
+ "amenities": amenities,
+ "amenities_display": " | ".join(amenities_display) if amenities_display else "No amenities",
+ "requirements": draft.get("requirements") or "No special requirements",
+ "images": draft.get("images", []),
+ "images_count": len(draft.get("images", [])),
+ "user_id": draft.get("user_id"),
+ "actions": ["publish", "edit", "discard"],
+ }
+
+ return ui_component
+
+
  # ========== MAIN PROCESS LISTING ==========
 
  async def process_listing(
@@ -319,18 +458,18 @@ async def process_listing(
  user_id: str,
  user_role: str,
  current_state: Optional[Dict] = None,
- image_urls: Optional[List[str]] = None,
  ) -> Dict:
  """
- Process listing with SIMPLIFIED LOGIC:
-
- 1. Show example first time
- 2. Extract fields
- 3. Ask missing required fields ONE AT A TIME
- 4. Ask about amenities/requirements ONCE
- 5. Auto-detect: currency, listing_type, title, description
- 6. Generate draft
- 7. Return draft for Flutter UI to display
+ Process listing with UPDATED LOGIC:
+
+ 1. Show RANDOM example first time
+ 2. AI-extract image URLs from message
+ 3. Extract fields
+ 4. Ask missing required fields ONE AT A TIME
+ 5. Ask about amenities/requirements ONCE
+ 6. AUTO-REQUIRE AT LEAST 1 IMAGE BEFORE DRAFT
+ 7. Generate draft WITH UI COMPONENT
+ 8. Return draft for Flutter UI to display
  """
 
  logger.info("Processing listing", user_id=user_id, user_role=user_role)
@@ -339,19 +478,31 @@ async def process_listing(
  "status": "listing",
  "step": "initial",
  "provided_fields": {},
- "images": image_urls or [],
+ "images": [],
  }
 
- # STEP 1: Show example if first time
+ # ========== AI-POWERED: EXTRACT IMAGE URLs FROM MESSAGE ==========
+ extracted_urls = await extract_image_urls_from_message(user_message)
+
+ # Add extracted URLs to images list (avoid duplicates)
+ current_images = state.get("images", [])
+ for url in extracted_urls:
+ if url not in current_images:
+ current_images.append(url)
+ logger.info(f"Added image URL from message via AI extraction", url=url[:60] + "...")
+
+ state["images"] = current_images
+
+ # STEP 1: Show RANDOM example if first time
  if state.get("step") == "initial":
- logger.info("First time listing - showing example")
+ logger.info("First time listing - generating random example")
 
  example = await generate_listing_example("en", user_role) # TODO: Detect user language
 
  return {
  "success": True,
  "action": "show_example",
- "reply": f"Great! 🏠 Here's an example of how you could describe it:\n\n\"{example}\"\n\nNow tell me about your property.",
+ "reply": f"Great! 🏠 Here's an example of how you could describe it:\n\n\"{example}\"\n\nNow tell me about your property. You can also upload images by sharing the image URL.",
  "data": {},
  "state": {
  "status": "listing",
@@ -370,7 +521,7 @@ async def process_listing(
  if value is not None and value != [] and value != "":
  provided_fields[key] = value
 
- logger.info("Fields collected so far", provided=list(provided_fields.keys()))
+ logger.info("Fields collected so far", provided=list(provided_fields.keys()), images=len(current_images))
 
  # STEP 3: Check for missing required fields
  missing_fields = [f for f in REQUIRED_FIELDS if f not in provided_fields or provided_fields[f] is None]
@@ -400,7 +551,7 @@ async def process_listing(
  "step": "collecting_fields",
  "provided_fields": provided_fields,
  "missing_fields": missing_fields,
- "images": state.get("images", []),
+ "images": current_images,
  }
  }
 
@@ -417,11 +568,30 @@ async def process_listing(
  "status": "listing",
  "step": "collecting_optional",
  "provided_fields": provided_fields,
- "images": state.get("images", []),
+ "images": current_images,
+ }
+ }
+
+ # STEP 5: CHECK FOR IMAGES - REQUIRE AT LEAST 1
+ if not current_images or len(current_images) == 0:
+ logger.info("No images provided - asking user to upload", user_id=user_id)
+
+ return {
+ "success": True,
+ "action": "ask_images",
+ "reply": "📷 Please share at least one image of your property by sending the image URL. Example: 'Here's the property image: https://imagedelivery.net/...' This helps buyers/renters see what they're getting!",
+ "data": provided_fields,
+ "state": {
+ "status": "listing",
+ "step": "waiting_for_images",
+ "provided_fields": provided_fields,
+ "images": [],
  }
  }
 
- # STEP 5: Auto-detect listing_type and currency
+ logger.info("Images provided", image_count=len(current_images), user_id=user_id)
+
+ # STEP 6: Auto-detect listing_type and currency
  listing_type = await auto_detect_listing_type(
  price_type=provided_fields.get("price_type", ""),
  user_role=user_role,
@@ -439,10 +609,10 @@ async def process_listing(
  user_id=user_id
  )
 
- # STEP 6: Generate title and description
+ # STEP 7: Generate title and description
  title, description = await generate_title_and_description(provided_fields, user_role)
 
- # STEP 7: Build draft
+ # STEP 8: Build draft
  draft = {
  "user_id": user_id,
  "user_role": user_role,
@@ -457,12 +627,15 @@ async def process_listing(
  "listing_type": provided_fields.get("listing_type"),
  "amenities": provided_fields.get("amenities", []),
  "requirements": provided_fields.get("requirements"),
- "images": state.get("images", []), # Images from Cloudflare (client-side upload)
+ "images": current_images, # Images from AI-extracted URLs
  }
 
- logger.info("Draft ready for preview", title=title, location=provided_fields.get("location"))
+ # STEP 9: Build UI component for draft preview
+ draft_ui = build_draft_ui_component(draft)
+
+ logger.info("Draft with UI component ready for preview", title=title, location=provided_fields.get("location"), image_count=len(current_images))
 
- # STEP 8: Return draft (Flutter builds the UI)
+ # STEP 10: Return draft with UI component
  return {
  "success": True,
  "action": "show_draft",
@@ -472,7 +645,8 @@ async def process_listing(
  "status": "listing",
  "step": "preview_ready",
  "provided_fields": provided_fields,
- "images": state.get("images", []),
+ "images": current_images,
  },
  "draft": draft,
+ "draft_ui": draft_ui, # ✅ UI component for frontend
  }
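Since `extract_image_urls_from_message` spends an LLM call per message and has to survive malformed JSON, a deterministic pre-filter is a common complement for the easy cases. A minimal, self-contained sketch; the function name, extension list, and host hints are illustrative assumptions, not code from this commit:

```python
import re

# Hypothetical cheap first pass: catch URLs with an image extension or a
# known image host before (or instead of) calling the LLM extractor.
_URL_RE = re.compile(r"https?://[^\s\"'<>]+")
_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".gif", ".webp")
_HOST_HINTS = ("imagedelivery", "cloudinary", "imgur")

def extract_image_urls_fast(message: str) -> list:
    """Return likely image URLs found in free-form text, preserving order."""
    found = []
    for raw in _URL_RE.findall(message):
        url = raw.rstrip(".,;:!?)")  # strip trailing prose punctuation
        path = url.split("?", 1)[0].lower()
        if path.endswith(_IMAGE_EXTS) or any(h in url for h in _HOST_HINTS):
            if url not in found:  # avoid duplicates, as the AI path does
                found.append(url)
    return found

print(extract_image_urls_fast(
    "Here's the property image: https://imagedelivery.net/acct/img1.png, thanks!"
))  # ['https://imagedelivery.net/acct/img1.png']
```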
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/README.md DELETED
@@ -1,173 +0,0 @@
- ---
- language: en
- license: apache-2.0
- library_name: sentence-transformers
- tags:
- - sentence-transformers
- - feature-extraction
- - sentence-similarity
- - transformers
- datasets:
- - s2orc
- - flax-sentence-embeddings/stackexchange_xml
- - ms_marco
- - gooaq
- - yahoo_answers_topics
- - code_search_net
- - search_qa
- - eli5
- - snli
- - multi_nli
- - wikihow
- - natural_questions
- - trivia_qa
- - embedding-data/sentence-compression
- - embedding-data/flickr30k-captions
- - embedding-data/altlex
- - embedding-data/simple-wiki
- - embedding-data/QQP
- - embedding-data/SPECTER
- - embedding-data/PAQ_pairs
- - embedding-data/WikiAnswers
- pipeline_tag: sentence-similarity
- ---
-
-
- # all-MiniLM-L6-v2
- This is a [sentence-transformers](https://www.SBERT.net) model: it maps sentences & paragraphs to a 384-dimensional dense vector space and can be used for tasks like clustering or semantic search.
-
- ## Usage (Sentence-Transformers)
- Using this model becomes easy when you have [sentence-transformers](https://www.SBERT.net) installed:
-
- ```
- pip install -U sentence-transformers
- ```
-
- Then you can use the model like this:
- ```python
- from sentence_transformers import SentenceTransformer
- sentences = ["This is an example sentence", "Each sentence is converted"]
-
- model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
- embeddings = model.encode(sentences)
- print(embeddings)
- ```
-
- ## Usage (HuggingFace Transformers)
- Without [sentence-transformers](https://www.SBERT.net), you can use the model like this: first, you pass your input through the transformer model, then you have to apply the right pooling operation on top of the contextualized word embeddings.
-
- ```python
- from transformers import AutoTokenizer, AutoModel
- import torch
- import torch.nn.functional as F
-
- # Mean pooling - take the attention mask into account for correct averaging
- def mean_pooling(model_output, attention_mask):
-     token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-
- # Sentences we want sentence embeddings for
- sentences = ['This is an example sentence', 'Each sentence is converted']
-
- # Load model from HuggingFace Hub
- tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
- model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
-
- # Tokenize sentences
- encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
-
- # Compute token embeddings
- with torch.no_grad():
-     model_output = model(**encoded_input)
-
- # Perform pooling
- sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
-
- # Normalize embeddings
- sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
-
- print("Sentence embeddings:")
- print(sentence_embeddings)
- ```
-
- ------
-
- ## Background
-
- The project aims to train sentence embedding models on very large sentence-level datasets using a self-supervised
- contrastive learning objective. We used the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model and fine-tuned it on a
- 1B sentence pairs dataset. We use a contrastive learning objective: given a sentence from the pair, the model should predict which out of a set of randomly sampled other sentences was actually paired with it in our dataset.
-
- We developed this model during the
- [Community week using JAX/Flax for NLP & CV](https://discuss.huggingface.co/t/open-to-the-community-community-week-using-jax-flax-for-nlp-cv/7104),
- organized by Hugging Face. We developed this model as part of the project:
- [Train the Best Sentence Embedding Model Ever with 1B Training Pairs](https://discuss.huggingface.co/t/train-the-best-sentence-embedding-model-ever-with-1b-training-pairs/7354). We benefited from efficient hardware infrastructure to run the project: 7 TPU v3-8s, as well as guidance from Google's Flax, JAX, and Cloud team members on efficient deep learning frameworks.
-
- ## Intended uses
-
- Our model is intended to be used as a sentence and short paragraph encoder. Given an input text, it outputs a vector which captures
- the semantic information. The sentence vector may be used for information retrieval, clustering or sentence similarity tasks.
-
- By default, input text longer than 256 word pieces is truncated.
-
-
- ## Training procedure
-
- ### Pre-training
-
- We use the pretrained [`nreimers/MiniLM-L6-H384-uncased`](https://huggingface.co/nreimers/MiniLM-L6-H384-uncased) model. Please refer to the model card for more detailed information about the pre-training procedure.
-
- ### Fine-tuning
-
- We fine-tune the model using a contrastive objective. Formally, we compute the cosine similarity of each possible sentence pair from the batch.
- We then apply the cross-entropy loss by comparing with the true pairs.
-
- #### Hyperparameters
-
- We trained our model on a TPU v3-8. We train the model for 100k steps using a batch size of 1024 (128 per TPU core).
- We use a learning rate warm-up of 500 steps. The sequence length was limited to 128 tokens. We used the AdamW optimizer with
- a 2e-5 learning rate. The full training script is accessible in this current repository: `train_script.py`.
-
- #### Training data
-
- We use the concatenation of multiple datasets to fine-tune our model. The total number of sentence pairs is above 1 billion.
- We sampled each dataset given a weighted probability; the configuration is detailed in the `data_config.json` file.
-
-
- | Dataset | Paper | Number of training tuples |
- |--------------------------------------------------------|:----------------------------------------:|:--------------------------:|
- | [Reddit comments (2015-2018)](https://github.com/PolyAI-LDN/conversational-datasets/tree/master/reddit) | [paper](https://arxiv.org/abs/1904.06472) | 726,484,430 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Abstracts) | [paper](https://aclanthology.org/2020.acl-main.447/) | 116,288,806 |
- | [WikiAnswers](https://github.com/afader/oqa#wikianswers-corpus) Duplicate question pairs | [paper](https://doi.org/10.1145/2623330.2623677) | 77,427,422 |
- | [PAQ](https://github.com/facebookresearch/PAQ) (Question, Answer) pairs | [paper](https://arxiv.org/abs/2102.07033) | 64,371,441 |
- | [S2ORC](https://github.com/allenai/s2orc) Citation pairs (Titles) | [paper](https://aclanthology.org/2020.acl-main.447/) | 52,603,982 |
- | [S2ORC](https://github.com/allenai/s2orc) (Title, Abstract) | [paper](https://aclanthology.org/2020.acl-main.447/) | 41,769,185 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Body) pairs | - | 25,316,456 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title+Body, Answer) pairs | - | 21,396,559 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) (Title, Answer) pairs | - | 21,396,559 |
- | [MS MARCO](https://microsoft.github.io/msmarco/) triplets | [paper](https://doi.org/10.1145/3404835.3462804) | 9,144,553 |
- | [GOOAQ: Open Question Answering with Diverse Answer Types](https://github.com/allenai/gooaq) | [paper](https://arxiv.org/pdf/2104.08727.pdf) | 3,012,496 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 1,198,260 |
- | [Code Search](https://huggingface.co/datasets/code_search_net) | - | 1,151,414 |
- | [COCO](https://cocodataset.org/#home) Image captions | [paper](https://link.springer.com/chapter/10.1007%2F978-3-319-10602-1_48) | 828,395 |
- | [SPECTER](https://github.com/allenai/specter) citation triplets | [paper](https://doi.org/10.18653/v1/2020.acl-main.207) | 684,100 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Question, Answer) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 681,164 |
- | [Yahoo Answers](https://www.kaggle.com/soumikrakshit/yahoo-answers-dataset) (Title, Question) | [paper](https://proceedings.neurips.cc/paper/2015/hash/250cf8b51c773f3f8dc8b4be867a9a02-Abstract.html) | 659,896 |
- | [SearchQA](https://huggingface.co/datasets/search_qa) | [paper](https://arxiv.org/abs/1704.05179) | 582,261 |
- | [Eli5](https://huggingface.co/datasets/eli5) | [paper](https://doi.org/10.18653/v1/p19-1346) | 325,475 |
- | [Flickr 30k](https://shannon.cs.illinois.edu/DenotationGraph/) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/229/33) | 317,695 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles) | | 304,525 |
- | AllNLI ([SNLI](https://nlp.stanford.edu/projects/snli/) and [MultiNLI](https://cims.nyu.edu/~sbowman/multinli/)) | [paper SNLI](https://doi.org/10.18653/v1/d15-1075), [paper MultiNLI](https://doi.org/10.18653/v1/n18-1101) | 277,230 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (bodies) | | 250,519 |
- | [Stack Exchange](https://huggingface.co/datasets/flax-sentence-embeddings/stackexchange_xml) Duplicate questions (titles+bodies) | | 250,460 |
- | [Sentence Compression](https://github.com/google-research-datasets/sentence-compression) | [paper](https://www.aclweb.org/anthology/D13-1155/) | 180,000 |
- | [Wikihow](https://github.com/pvl/wikihow_pairs_dataset) | [paper](https://arxiv.org/abs/1810.09305) | 128,542 |
- | [Altlex](https://github.com/chridey/altlex/) | [paper](https://aclanthology.org/P16-1135.pdf) | 112,696 |
- | [Quora Question Triplets](https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs) | - | 103,663 |
- | [Simple Wikipedia](https://cs.pomona.edu/~dkauchak/simplification/) | [paper](https://www.aclweb.org/anthology/P11-2117/) | 102,225 |
- | [Natural Questions (NQ)](https://ai.google.com/research/NaturalQuestions) | [paper](https://transacl.org/ojs/index.php/tacl/article/view/1455) | 100,231 |
- | [SQuAD2.0](https://rajpurkar.github.io/SQuAD-explorer/) | [paper](https://aclanthology.org/P18-2124.pdf) | 87,599 |
- | [TriviaQA](https://huggingface.co/datasets/trivia_qa) | - | 73,346 |
- | **Total** | | **1,170,060,424** |
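This commit also removes the vendored model copy under app/ml/trainning/models/ and the Hub cache blobs under models/ (files 3-29 above). Presumably the model is now resolved from the Hugging Face Hub at runtime and cached on first use; a minimal sketch, mirroring the usage section of the README deleted above:

```python
# Sketch only - not code from this commit. With the local copies deleted,
# SentenceTransformer downloads the model from the Hub on first use and
# caches it (by default under ~/.cache, configurable via HF_HOME).
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
embeddings = model.encode(["2-bedroom apartment in Lisbon with wifi"])
print(embeddings.shape)  # (1, 384) - 384-dimensional dense vectors
```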
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/data_config.json DELETED
@@ -1,1452 +0,0 @@
- [
-   {"name": "stackexchange_title_body/skeptics.stackexchange.com.jsonl.gz", "lines": 10009, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/islam.stackexchange.com.jsonl.gz", "lines": 10052, "weight": 1},
-   {"name": "stackexchange_Title_Answer/islam.stackexchange.com.jsonl.gz", "lines": 10052, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/anime.stackexchange.com.jsonl.gz", "lines": 10131, "weight": 1},
-   {"name": "stackexchange_Title_Answer/anime.stackexchange.com.jsonl.gz", "lines": 10131, "weight": 1},
-   {"name": "stackexchange_title_body/writers.stackexchange.com.jsonl.gz", "lines": 10157, "weight": 1},
-   {"name": "stackexchange_title_body/astronomy.stackexchange.com.jsonl.gz", "lines": 10462, "weight": 1},
-   {"name": "stackexchange_title_body/vi.stackexchange.com.jsonl.gz", "lines": 10551, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/french.stackexchange.com.jsonl.gz", "lines": 10578, "weight": 1},
-   {"name": "stackexchange_Title_Answer/french.stackexchange.com.jsonl.gz", "lines": 10578, "weight": 1},
-   {"name": "stackexchange_title_body/cstheory.stackexchange.com.jsonl.gz", "lines": 10642, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/civicrm.stackexchange.com.jsonl.gz", "lines": 10648, "weight": 1},
-   {"name": "stackexchange_Title_Answer/civicrm.stackexchange.com.jsonl.gz", "lines": 10648, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/expressionengine.stackexchange.com.jsonl.gz", "lines": 10742, "weight": 1},
-   {"name": "stackexchange_Title_Answer/expressionengine.stackexchange.com.jsonl.gz", "lines": 10742, "weight": 1},
-   {"name": "stackexchange_title_body/engineering.stackexchange.com.jsonl.gz", "lines": 10753, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/history.stackexchange.com.jsonl.gz", "lines": 10766, "weight": 1},
-   {"name": "stackexchange_Title_Answer/history.stackexchange.com.jsonl.gz", "lines": 10766, "weight": 1},
-   {"name": "stackexchange_title_body/french.stackexchange.com.jsonl.gz", "lines": 10794, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/politics.stackexchange.com.jsonl.gz", "lines": 11047, "weight": 1},
-   {"name": "stackexchange_Title_Answer/politics.stackexchange.com.jsonl.gz", "lines": 11047, "weight": 1},
-   {"name": "stackexchange_title_body/economics.stackexchange.com.jsonl.gz", "lines": 11115, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/craftcms.stackexchange.com.jsonl.gz", "lines": 11236, "weight": 1},
-   {"name": "stackexchange_Title_Answer/craftcms.stackexchange.com.jsonl.gz", "lines": 11236, "weight": 1},
-   {"name": "stackexchange_title_body/anime.stackexchange.com.jsonl.gz", "lines": 11444, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/christianity.stackexchange.com.jsonl.gz", "lines": 11498, "weight": 1},
-   {"name": "stackexchange_Title_Answer/christianity.stackexchange.com.jsonl.gz", "lines": 11498, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/softwarerecs.stackexchange.com.jsonl.gz", "lines": 11761, "weight": 1},
-   {"name": "stackexchange_Title_Answer/softwarerecs.stackexchange.com.jsonl.gz", "lines": 11761, "weight": 1},
-   {"name": "stackexchange_TitleBody_Answer/boardgames.stackexchange.com.jsonl.gz", "lines": 11805, "weight": 1},
-   {"name": "stackexchange_Title_Answer/boardgames.stackexchange.com.jsonl.gz", "lines": 11805, "weight": 1},
-   {"name": "stackexchange_title_body/islam.stackexchange.com.jsonl.gz", "lines": 11853, "weight": 1},
-   {"name": "stackexchange_title_body/expressionengine.stackexchange.com.jsonl.gz", "lines": 11866, "weight": 1},
-   {"name": "stackexchange_title_body/politics.stackexchange.com.jsonl.gz", "lines": 11894, "weight": 1},
-   {"name": "stackexchange_title_body/history.stackexchange.com.jsonl.gz", "lines": 12021, "weight": 1},
-   {"name": "stackexchange_title_body/christianity.stackexchange.com.jsonl.gz", "lines": 12108, "weight": 1},
-   {"name": "stackexchange_title_body/boardgames.stackexchange.com.jsonl.gz", "lines": 12149, "weight": 1},
-   {"name": "flickr30k_captions.jsonl.gz", "lines": 317695, "weight": 1},
-   {"name": "coco_captions.jsonl.gz", "lines": 828395, "weight": 1},
-   {"name": "codesearchnet.jsonl.gz", "lines": 1151414, "weight": 1},
-   {"name": "stackexchange_title_body/civicrm.stackexchange.com.jsonl.gz", "lines": 12543, "weight": 2},
-   {"name": "stackexchange_title_body/craftcms.stackexchange.com.jsonl.gz", "lines": 12574, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/networkengineering.stackexchange.com.jsonl.gz", "lines": 12590, "weight": 2},
-   {"name": "stackexchange_Title_Answer/networkengineering.stackexchange.com.jsonl.gz", "lines": 12590, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/space.stackexchange.com.jsonl.gz", "lines": 12893, "weight": 2},
-   {"name": "stackexchange_Title_Answer/space.stackexchange.com.jsonl.gz", "lines": 12893, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/quant.stackexchange.com.jsonl.gz", "lines": 12933, "weight": 2},
-   {"name": "stackexchange_Title_Answer/quant.stackexchange.com.jsonl.gz", "lines": 12933, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/philosophy.stackexchange.com.jsonl.gz", "lines": 13114, "weight": 2},
-   {"name": "stackexchange_Title_Answer/philosophy.stackexchange.com.jsonl.gz", "lines": 13114, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/gardening.stackexchange.com.jsonl.gz", "lines": 13246, "weight": 2},
-   {"name": "stackexchange_Title_Answer/gardening.stackexchange.com.jsonl.gz", "lines": 13246, "weight": 2},
-   {"name": "stackexchange_title_body/hinduism.stackexchange.com.jsonl.gz", "lines": 13450, "weight": 2},
-   {"name": "stackexchange_title_body/networkengineering.stackexchange.com.jsonl.gz", "lines": 13454, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/german.stackexchange.com.jsonl.gz", "lines": 13733, "weight": 2},
-   {"name": "stackexchange_Title_Answer/german.stackexchange.com.jsonl.gz", "lines": 13733, "weight": 2},
-   {"name": "stackexchange_title_body/german.stackexchange.com.jsonl.gz", "lines": 13950, "weight": 2},
-   {"name": "stackexchange_title_body/philosophy.stackexchange.com.jsonl.gz", "lines": 14829, "weight": 2},
-   {"name": "stackexchange_title_body/gardening.stackexchange.com.jsonl.gz", "lines": 15136, "weight": 2},
-   {"name": "stackexchange_title_body/space.stackexchange.com.jsonl.gz", "lines": 15142, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/bicycles.stackexchange.com.jsonl.gz", "lines": 15708, "weight": 2},
-   {"name": "stackexchange_Title_Answer/bicycles.stackexchange.com.jsonl.gz", "lines": 15708, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/law.stackexchange.com.jsonl.gz", "lines": 16133, "weight": 2},
-   {"name": "stackexchange_Title_Answer/law.stackexchange.com.jsonl.gz", "lines": 16133, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/arduino.stackexchange.com.jsonl.gz", "lines": 16281, "weight": 2},
-   {"name": "stackexchange_Title_Answer/arduino.stackexchange.com.jsonl.gz", "lines": 16281, "weight": 2},
-   {"name": "stackexchange_title_body/bicycles.stackexchange.com.jsonl.gz", "lines": 16353, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/emacs.stackexchange.com.jsonl.gz", "lines": 16830, "weight": 2},
-   {"name": "stackexchange_Title_Answer/emacs.stackexchange.com.jsonl.gz", "lines": 16830, "weight": 2},
-   {"name": "stackexchange_title_body/quant.stackexchange.com.jsonl.gz", "lines": 17261, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/dsp.stackexchange.com.jsonl.gz", "lines": 17430, "weight": 2},
-   {"name": "stackexchange_Title_Answer/dsp.stackexchange.com.jsonl.gz", "lines": 17430, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/puzzling.stackexchange.com.jsonl.gz", "lines": 17448, "weight": 2},
-   {"name": "stackexchange_Title_Answer/puzzling.stackexchange.com.jsonl.gz", "lines": 17448, "weight": 2},
-   {"name": "stackexchange_title_body/puzzling.stackexchange.com.jsonl.gz", "lines": 17851, "weight": 2},
-   {"name": "stackexchange_title_body/law.stackexchange.com.jsonl.gz", "lines": 17941, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/movies.stackexchange.com.jsonl.gz", "lines": 18243, "weight": 2},
-   {"name": "stackexchange_Title_Answer/movies.stackexchange.com.jsonl.gz", "lines": 18243, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/mechanics.stackexchange.com.jsonl.gz", "lines": 18613, "weight": 2},
-   {"name": "stackexchange_Title_Answer/mechanics.stackexchange.com.jsonl.gz", "lines": 18613, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/aviation.stackexchange.com.jsonl.gz", "lines": 18755, "weight": 2},
-   {"name": "stackexchange_Title_Answer/aviation.stackexchange.com.jsonl.gz", "lines": 18755, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/biology.stackexchange.com.jsonl.gz", "lines": 19277, "weight": 2},
-   {"name": "stackexchange_Title_Answer/biology.stackexchange.com.jsonl.gz", "lines": 19277, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/crypto.stackexchange.com.jsonl.gz", "lines": 19404, "weight": 2},
-   {"name": "stackexchange_Title_Answer/crypto.stackexchange.com.jsonl.gz", "lines": 19404, "weight": 2},
-   {"name": "stackexchange_title_body/arduino.stackexchange.com.jsonl.gz", "lines": 19553, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/music.stackexchange.com.jsonl.gz", "lines": 19936, "weight": 2},
-   {"name": "stackexchange_Title_Answer/music.stackexchange.com.jsonl.gz", "lines": 19936, "weight": 2},
-   {"name": "stackexchange_title_body/aviation.stackexchange.com.jsonl.gz", "lines": 20139, "weight": 2},
-   {"name": "stackexchange_title_body/softwarerecs.stackexchange.com.jsonl.gz", "lines": 20142, "weight": 2},
-   {"name": "stackexchange_title_body/movies.stackexchange.com.jsonl.gz", "lines": 20181, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/datascience.stackexchange.com.jsonl.gz", "lines": 20503, "weight": 2},
-   {"name": "stackexchange_Title_Answer/datascience.stackexchange.com.jsonl.gz", "lines": 20503, "weight": 2},
-   {"name": "stackexchange_title_body/music.stackexchange.com.jsonl.gz", "lines": 20636, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/japanese.stackexchange.com.jsonl.gz", "lines": 20948, "weight": 2},
-   {"name": "stackexchange_Title_Answer/japanese.stackexchange.com.jsonl.gz", "lines": 20948, "weight": 2},
-   {"name": "stackexchange_title_body/emacs.stackexchange.com.jsonl.gz", "lines": 21055, "weight": 2},
-   {"name": "stackexchange_title_body/dsp.stackexchange.com.jsonl.gz", "lines": 21252, "weight": 2},
-   {"name": "stackexchange_title_body/japanese.stackexchange.com.jsonl.gz", "lines": 22056, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/bitcoin.stackexchange.com.jsonl.gz", "lines": 22474, "weight": 2},
-   {"name": "stackexchange_Title_Answer/bitcoin.stackexchange.com.jsonl.gz", "lines": 22474, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/cooking.stackexchange.com.jsonl.gz", "lines": 22641, "weight": 2},
-   {"name": "stackexchange_Title_Answer/cooking.stackexchange.com.jsonl.gz", "lines": 22641, "weight": 2},
-   {"name": "stackexchange_title_body/mechanics.stackexchange.com.jsonl.gz", "lines": 22868, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/photo.stackexchange.com.jsonl.gz", "lines": 23204, "weight": 2},
-   {"name": "stackexchange_Title_Answer/photo.stackexchange.com.jsonl.gz", "lines": 23204, "weight": 2},
-   {"name": "stackexchange_title_body/crypto.stackexchange.com.jsonl.gz", "lines": 23231, "weight": 2},
-   {"name": "stackexchange_title_body/cooking.stackexchange.com.jsonl.gz", "lines": 23705, "weight": 2},
-   {"name": "stackexchange_title_body/photo.stackexchange.com.jsonl.gz", "lines": 23753, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/workplace.stackexchange.com.jsonl.gz", "lines": 24012, "weight": 2},
-   {"name": "stackexchange_Title_Answer/workplace.stackexchange.com.jsonl.gz", "lines": 24012, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/meta.stackoverflow.com.jsonl.gz", "lines": 24044, "weight": 2},
-   {"name": "stackexchange_Title_Answer/meta.stackoverflow.com.jsonl.gz", "lines": 24044, "weight": 2},
-   {"name": "stackexchange_TitleBody_Answer/raspberrypi.stackexchange.com.jsonl.gz", "lines": 24143, "weight": 2},
-   {"name": "stackexchange_Title_Answer/raspberrypi.stackexchange.com.jsonl.gz", "lines": 24143, "weight": 2},
-   {"name": "stackexchange_title_body/workplace.stackexchange.com.jsonl.gz", "lines": 24189, "weight": 2},
-   {"name": "stackexchange_title_body/biology.stackexchange.com.jsonl.gz", "lines": 24447, "weight": 3},
-   {"name": "stackexchange_TitleBody_Answer/webapps.stackexchange.com.jsonl.gz", "lines": 24867, "weight": 3},
-   {"name": "stackexchange_Title_Answer/webapps.stackexchange.com.jsonl.gz", "lines": 24867, "weight": 3},
-   {"name": "stackexchange_title_body/bitcoin.stackexchange.com.jsonl.gz", "lines": 25374, "weight": 3},
-   {"name": "stackexchange_TitleBody_Answer/judaism.stackexchange.com.jsonl.gz", "lines": 26085, "weight": 3},
-   {"name": "stackexchange_Title_Answer/judaism.stackexchange.com.jsonl.gz", "lines": 26085, "weight": 3},
-   {"name": "stackexchange_TitleBody_Answer/ethereum.stackexchange.com.jsonl.gz", "lines": 26124, "weight": 3},
-   {"name": "stackexchange_Title_Answer/ethereum.stackexchange.com.jsonl.gz",
624
- "lines": 26124,
625
- "weight": 3
626
- },
627
- {
628
- "name": "stackexchange_TitleBody_Answer/worldbuilding.stackexchange.com.jsonl.gz",
629
- "lines": 26210,
630
- "weight": 3
631
- },
632
- {
633
- "name": "stackexchange_Title_Answer/worldbuilding.stackexchange.com.jsonl.gz",
634
- "lines": 26210,
635
- "weight": 3
636
- },
637
- {
638
- "name": "stackexchange_title_body/worldbuilding.stackexchange.com.jsonl.gz",
639
- "lines": 26763,
640
- "weight": 3
641
- },
642
- {
643
- "name": "stackexchange_TitleBody_Answer/chemistry.stackexchange.com.jsonl.gz",
644
- "lines": 27061,
645
- "weight": 3
646
- },
647
- {
648
- "name": "stackexchange_Title_Answer/chemistry.stackexchange.com.jsonl.gz",
649
- "lines": 27061,
650
- "weight": 3
651
- },
652
- {
653
- "name": "stackexchange_title_body/datascience.stackexchange.com.jsonl.gz",
654
- "lines": 27397,
655
- "weight": 3
656
- },
657
- {
658
- "name": "stackexchange_TitleBody_Answer/graphicdesign.stackexchange.com.jsonl.gz",
659
- "lines": 28083,
660
- "weight": 3
661
- },
662
- {
663
- "name": "stackexchange_Title_Answer/graphicdesign.stackexchange.com.jsonl.gz",
664
- "lines": 28083,
665
- "weight": 3
666
- },
667
- {
668
- "name": "stackexchange_TitleBody_Answer/ux.stackexchange.com.jsonl.gz",
669
- "lines": 28901,
670
- "weight": 3
671
- },
672
- {
673
- "name": "stackexchange_Title_Answer/ux.stackexchange.com.jsonl.gz",
674
- "lines": 28901,
675
- "weight": 3
676
- },
677
- {
678
- "name": "stackexchange_title_body/ux.stackexchange.com.jsonl.gz",
679
- "lines": 29403,
680
- "weight": 3
681
- },
682
- {
683
- "name": "stackexchange_TitleBody_Answer/money.stackexchange.com.jsonl.gz",
684
- "lines": 29404,
685
- "weight": 3
686
- },
687
- {
688
- "name": "stackexchange_Title_Answer/money.stackexchange.com.jsonl.gz",
689
- "lines": 29404,
690
- "weight": 3
691
- },
692
- {
693
- "name": "stackexchange_title_body/webapps.stackexchange.com.jsonl.gz",
694
- "lines": 29697,
695
- "weight": 3
696
- },
697
- {
698
- "name": "stackexchange_TitleBody_Answer/cs.stackexchange.com.jsonl.gz",
699
- "lines": 30010,
700
- "weight": 3
701
- },
702
- {
703
- "name": "stackexchange_Title_Answer/cs.stackexchange.com.jsonl.gz",
704
- "lines": 30010,
705
- "weight": 3
706
- },
707
- {
708
- "name": "stackexchange_title_body/graphicdesign.stackexchange.com.jsonl.gz",
709
- "lines": 30233,
710
- "weight": 3
711
- },
712
- {
713
- "name": "stackexchange_TitleBody_Answer/webmasters.stackexchange.com.jsonl.gz",
714
- "lines": 30370,
715
- "weight": 3
716
- },
717
- {
718
- "name": "stackexchange_Title_Answer/webmasters.stackexchange.com.jsonl.gz",
719
- "lines": 30370,
720
- "weight": 3
721
- },
722
- {
723
- "name": "stackexchange_title_body/raspberrypi.stackexchange.com.jsonl.gz",
724
- "lines": 30625,
725
- "weight": 3
726
- },
727
- {
728
- "name": "stackexchange_title_body/money.stackexchange.com.jsonl.gz",
729
- "lines": 32021,
730
- "weight": 3
731
- },
732
- {
733
- "name": "stackexchange_title_body/judaism.stackexchange.com.jsonl.gz",
734
- "lines": 32028,
735
- "weight": 3
736
- },
737
- {
738
- "name": "stackexchange_TitleBody_Answer/academia.stackexchange.com.jsonl.gz",
739
- "lines": 32137,
740
- "weight": 3
741
- },
742
- {
743
- "name": "stackexchange_Title_Answer/academia.stackexchange.com.jsonl.gz",
744
- "lines": 32137,
745
- "weight": 3
746
- },
747
- {
748
- "name": "stackexchange_title_body/ethereum.stackexchange.com.jsonl.gz",
749
- "lines": 32760,
750
- "weight": 3
751
- },
752
- {
753
- "name": "stackexchange_title_body/academia.stackexchange.com.jsonl.gz",
754
- "lines": 34331,
755
- "weight": 3
756
- },
757
- {
758
- "name": "stackexchange_title_body/chemistry.stackexchange.com.jsonl.gz",
759
- "lines": 34506,
760
- "weight": 3
761
- },
762
- {
763
- "name": "stackexchange_title_body/webmasters.stackexchange.com.jsonl.gz",
764
- "lines": 34559,
765
- "weight": 3
766
- },
767
- {
768
- "name": "stackexchange_title_body/meta.stackoverflow.com.jsonl.gz",
769
- "lines": 36456,
770
- "weight": 3
771
- },
772
- {
773
- "name": "stackexchange_TitleBody_Answer/travel.stackexchange.com.jsonl.gz",
774
- "lines": 36533,
775
- "weight": 4
776
- },
777
- {
778
- "name": "stackexchange_Title_Answer/travel.stackexchange.com.jsonl.gz",
779
- "lines": 36533,
780
- "weight": 4
781
- },
782
- {
783
- "name": "stackexchange_TitleBody_Answer/android.stackexchange.com.jsonl.gz",
784
- "lines": 38077,
785
- "weight": 4
786
- },
787
- {
788
- "name": "stackexchange_Title_Answer/android.stackexchange.com.jsonl.gz",
789
- "lines": 38077,
790
- "weight": 4
791
- },
792
- {
793
- "name": "stackexchange_title_body/cs.stackexchange.com.jsonl.gz",
794
- "lines": 38314,
795
- "weight": 4
796
- },
797
- {
798
- "name": "stackexchange_TitleBody_Answer/gamedev.stackexchange.com.jsonl.gz",
799
- "lines": 40154,
800
- "weight": 4
801
- },
802
- {
803
- "name": "stackexchange_Title_Answer/gamedev.stackexchange.com.jsonl.gz",
804
- "lines": 40154,
805
- "weight": 4
806
- },
807
- {
808
- "name": "stackexchange_TitleBody_Answer/rpg.stackexchange.com.jsonl.gz",
809
- "lines": 40435,
810
- "weight": 4
811
- },
812
- {
813
- "name": "stackexchange_Title_Answer/rpg.stackexchange.com.jsonl.gz",
814
- "lines": 40435,
815
- "weight": 4
816
- },
817
- {
818
- "name": "stackexchange_title_body/travel.stackexchange.com.jsonl.gz",
819
- "lines": 41227,
820
- "weight": 4
821
- },
822
- {
823
- "name": "stackexchange_TitleBody_Answer/codereview.stackexchange.com.jsonl.gz",
824
- "lines": 41748,
825
- "weight": 4
826
- },
827
- {
828
- "name": "stackexchange_Title_Answer/codereview.stackexchange.com.jsonl.gz",
829
- "lines": 41748,
830
- "weight": 4
831
- },
832
- {
833
- "name": "stackexchange_title_body/rpg.stackexchange.com.jsonl.gz",
834
- "lines": 42303,
835
- "weight": 4
836
- },
837
- {
838
- "name": "stackexchange_title_body/codereview.stackexchange.com.jsonl.gz",
839
- "lines": 45765,
840
- "weight": 4
841
- },
842
- {
843
- "name": "stackexchange_title_body/gamedev.stackexchange.com.jsonl.gz",
844
- "lines": 46485,
845
- "weight": 4
846
- },
847
- {
848
- "name": "stackexchange_TitleBody_Answer/softwareengineering.stackexchange.com.jsonl.gz",
849
- "lines": 51326,
850
- "weight": 5
851
- },
852
- {
853
- "name": "stackexchange_Title_Answer/softwareengineering.stackexchange.com.jsonl.gz",
854
- "lines": 51326,
855
- "weight": 5
856
- },
857
- {
858
- "name": "stackexchange_TitleBody_Answer/security.stackexchange.com.jsonl.gz",
859
- "lines": 51355,
860
- "weight": 5
861
- },
862
- {
863
- "name": "stackexchange_Title_Answer/security.stackexchange.com.jsonl.gz",
864
- "lines": 51355,
865
- "weight": 5
866
- },
867
- {
868
- "name": "stackexchange_title_body/android.stackexchange.com.jsonl.gz",
869
- "lines": 51608,
870
- "weight": 5
871
- },
872
- {
873
- "name": "stackexchange_TitleBody_Answer/diy.stackexchange.com.jsonl.gz",
874
- "lines": 52896,
875
- "weight": 5
876
- },
877
- {
878
- "name": "stackexchange_Title_Answer/diy.stackexchange.com.jsonl.gz",
879
- "lines": 52896,
880
- "weight": 5
881
- },
882
- {
883
- "name": "stackexchange_title_body/softwareengineering.stackexchange.com.jsonl.gz",
884
- "lines": 53942,
885
- "weight": 5
886
- },
887
- {
888
- "name": "stackexchange_TitleBody_Answer/blender.stackexchange.com.jsonl.gz",
889
- "lines": 54153,
890
- "weight": 5
891
- },
892
- {
893
- "name": "stackexchange_Title_Answer/blender.stackexchange.com.jsonl.gz",
894
- "lines": 54153,
895
- "weight": 5
896
- },
897
- {
898
- "name": "stackexchange_TitleBody_Answer/scifi.stackexchange.com.jsonl.gz",
899
- "lines": 54805,
900
- "weight": 5
901
- },
902
- {
903
- "name": "stackexchange_Title_Answer/scifi.stackexchange.com.jsonl.gz",
904
- "lines": 54805,
905
- "weight": 5
906
- },
907
- {
908
- "name": "stackexchange_title_body/security.stackexchange.com.jsonl.gz",
909
- "lines": 58000,
910
- "weight": 5
911
- },
912
- {
913
- "name": "stackexchange_TitleBody_Answer/mathematica.stackexchange.com.jsonl.gz",
914
- "lines": 59895,
915
- "weight": 5
916
- },
917
- {
918
- "name": "stackexchange_Title_Answer/mathematica.stackexchange.com.jsonl.gz",
919
- "lines": 59895,
920
- "weight": 5
921
- },
922
- {
923
- "name": "stackexchange_title_body/diy.stackexchange.com.jsonl.gz",
924
- "lines": 60083,
925
- "weight": 5
926
- },
927
- {
928
- "name": "stackexchange_TitleBody_Answer/meta.stackexchange.com.jsonl.gz",
929
- "lines": 60744,
930
- "weight": 5
931
- },
932
- {
933
- "name": "stackexchange_Title_Answer/meta.stackexchange.com.jsonl.gz",
934
- "lines": 60744,
935
- "weight": 5
936
- },
937
- {
938
- "name": "stackexchange_title_body/scifi.stackexchange.com.jsonl.gz",
939
- "lines": 61528,
940
- "weight": 6
941
- },
942
- {
943
- "name": "stackexchange_TitleBody_Answer/drupal.stackexchange.com.jsonl.gz",
944
- "lines": 67817,
945
- "weight": 6
946
- },
947
- {
948
- "name": "stackexchange_Title_Answer/drupal.stackexchange.com.jsonl.gz",
949
- "lines": 67817,
950
- "weight": 6
951
- },
952
- {
953
- "name": "stackexchange_TitleBody_Answer/dba.stackexchange.com.jsonl.gz",
954
- "lines": 71449,
955
- "weight": 6
956
- },
957
- {
958
- "name": "stackexchange_Title_Answer/dba.stackexchange.com.jsonl.gz",
959
- "lines": 71449,
960
- "weight": 6
961
- },
962
- {
963
- "name": "stackexchange_title_body/mathematica.stackexchange.com.jsonl.gz",
964
- "lines": 73131,
965
- "weight": 7
966
- },
967
- {
968
- "name": "stackexchange_TitleBody_Answer/ell.stackexchange.com.jsonl.gz",
969
- "lines": 77892,
970
- "weight": 7
971
- },
972
- {
973
- "name": "stackexchange_Title_Answer/ell.stackexchange.com.jsonl.gz",
974
- "lines": 77892,
975
- "weight": 7
976
- },
977
- {
978
- "name": "stackexchange_TitleBody_Answer/magento.stackexchange.com.jsonl.gz",
979
- "lines": 79241,
980
- "weight": 7
981
- },
982
- {
983
- "name": "stackexchange_Title_Answer/magento.stackexchange.com.jsonl.gz",
984
- "lines": 79241,
985
- "weight": 7
986
- },
987
- {
988
- "name": "stackexchange_title_body/drupal.stackexchange.com.jsonl.gz",
989
- "lines": 79717,
990
- "weight": 7
991
- },
992
- {
993
- "name": "stackexchange_TitleBody_Answer/sharepoint.stackexchange.com.jsonl.gz",
994
- "lines": 80420,
995
- "weight": 7
996
- },
997
- {
998
- "name": "stackexchange_Title_Answer/sharepoint.stackexchange.com.jsonl.gz",
999
- "lines": 80420,
1000
- "weight": 7
1001
- },
1002
- {
1003
- "name": "stackexchange_title_body/blender.stackexchange.com.jsonl.gz",
1004
- "lines": 80766,
1005
- "weight": 7
1006
- },
1007
- {
1008
- "name": "stackexchange_title_body/dba.stackexchange.com.jsonl.gz",
1009
- "lines": 81871,
1010
- "weight": 7
1011
- },
1012
- {
1013
- "name": "stackexchange_TitleBody_Answer/gaming.stackexchange.com.jsonl.gz",
1014
- "lines": 82887,
1015
- "weight": 7
1016
- },
1017
- {
1018
- "name": "stackexchange_Title_Answer/gaming.stackexchange.com.jsonl.gz",
1019
- "lines": 82887,
1020
- "weight": 7
1021
- },
1022
- {
1023
- "name": "stackexchange_title_body/ell.stackexchange.com.jsonl.gz",
1024
- "lines": 83271,
1025
- "weight": 7
1026
- },
1027
- {
1028
- "name": "stackexchange_title_body/meta.stackexchange.com.jsonl.gz",
1029
- "lines": 83510,
1030
- "weight": 7
1031
- },
1032
- {
1033
- "name": "stackexchange_TitleBody_Answer/wordpress.stackexchange.com.jsonl.gz",
1034
- "lines": 83621,
1035
- "weight": 7
1036
- },
1037
- {
1038
- "name": "stackexchange_Title_Answer/wordpress.stackexchange.com.jsonl.gz",
1039
- "lines": 83621,
1040
- "weight": 7
1041
- },
1042
- {
1043
- "name": "stackexchange_TitleBody_Answer/mathoverflow.net.jsonl.gz",
1044
- "lines": 85289,
1045
- "weight": 8
1046
- },
1047
- {
1048
- "name": "stackexchange_Title_Answer/mathoverflow.net.jsonl.gz",
1049
- "lines": 85289,
1050
- "weight": 8
1051
- },
1052
- {
1053
- "name": "stackexchange_TitleBody_Answer/salesforce.stackexchange.com.jsonl.gz",
1054
- "lines": 87272,
1055
- "weight": 8
1056
- },
1057
- {
1058
- "name": "stackexchange_Title_Answer/salesforce.stackexchange.com.jsonl.gz",
1059
- "lines": 87272,
1060
- "weight": 8
1061
- },
1062
- {
1063
- "name": "stackexchange_title_body/gaming.stackexchange.com.jsonl.gz",
1064
- "lines": 88912,
1065
- "weight": 8
1066
- },
1067
- {
1068
- "name": "stackexchange_TitleBody_Answer/apple.stackexchange.com.jsonl.gz",
1069
- "lines": 92487,
1070
- "weight": 8
1071
- },
1072
- {
1073
- "name": "stackexchange_Title_Answer/apple.stackexchange.com.jsonl.gz",
1074
- "lines": 92487,
1075
- "weight": 8
1076
- },
1077
- {
1078
- "name": "stackexchange_title_body/sharepoint.stackexchange.com.jsonl.gz",
1079
- "lines": 94011,
1080
- "weight": 8
1081
- },
1082
- {
1083
- "name": "stackexchange_title_body/magento.stackexchange.com.jsonl.gz",
1084
- "lines": 99991,
1085
- "weight": 9
1086
- },
1087
- {
1088
- "name": "stackexchange_TitleBody_Answer/gis.stackexchange.com.jsonl.gz",
1089
- "lines": 100254,
1090
- "weight": 9
1091
- },
1092
- {
1093
- "name": "stackexchange_Title_Answer/gis.stackexchange.com.jsonl.gz",
1094
- "lines": 100254,
1095
- "weight": 9
1096
- },
1097
- {
1098
- "name": "stackexchange_title_body/wordpress.stackexchange.com.jsonl.gz",
1099
- "lines": 100474,
1100
- "weight": 9
1101
- },
1102
- {
1103
- "name": "stackexchange_TitleBody_Answer/english.stackexchange.com.jsonl.gz",
1104
- "lines": 100640,
1105
- "weight": 9
1106
- },
1107
- {
1108
- "name": "stackexchange_Title_Answer/english.stackexchange.com.jsonl.gz",
1109
- "lines": 100640,
1110
- "weight": 9
1111
- },
1112
- {
1113
- "name": "stackexchange_title_body/salesforce.stackexchange.com.jsonl.gz",
1114
- "lines": 105260,
1115
- "weight": 9
1116
- },
1117
- {
1118
- "name": "stackexchange_title_body/english.stackexchange.com.jsonl.gz",
1119
- "lines": 109522,
1120
- "weight": 10
1121
- },
1122
- {
1123
- "name": "stackexchange_title_body/apple.stackexchange.com.jsonl.gz",
1124
- "lines": 110622,
1125
- "weight": 10
1126
- },
1127
- {
1128
- "name": "stackexchange_TitleBody_Answer/stats.stackexchange.com.jsonl.gz",
1129
- "lines": 115679,
1130
- "weight": 10
1131
- },
1132
- {
1133
- "name": "stackexchange_Title_Answer/stats.stackexchange.com.jsonl.gz",
1134
- "lines": 115679,
1135
- "weight": 10
1136
- },
1137
- {
1138
- "name": "stackexchange_title_body/mathoverflow.net.jsonl.gz",
1139
- "lines": 120851,
1140
- "weight": 10
1141
- },
1142
- {
1143
- "name": "stackexchange_TitleBody_Answer/electronics.stackexchange.com.jsonl.gz",
1144
- "lines": 129494,
1145
- "weight": 11
1146
- },
1147
- {
1148
- "name": "stackexchange_Title_Answer/electronics.stackexchange.com.jsonl.gz",
1149
- "lines": 129494,
1150
- "weight": 11
1151
- },
1152
- {
1153
- "name": "stackexchange_title_body/gis.stackexchange.com.jsonl.gz",
1154
- "lines": 131000,
1155
- "weight": 11
1156
- },
1157
- {
1158
- "name": "stackexchange_TitleBody_Answer/physics.stackexchange.com.jsonl.gz",
1159
- "lines": 141230,
1160
- "weight": 12
1161
- },
1162
- {
1163
- "name": "stackexchange_Title_Answer/physics.stackexchange.com.jsonl.gz",
1164
- "lines": 141230,
1165
- "weight": 12
1166
- },
1167
- {
1168
- "name": "stackexchange_title_body/electronics.stackexchange.com.jsonl.gz",
1169
- "lines": 143582,
1170
- "weight": 12
1171
- },
1172
- {
1173
- "name": "stackexchange_TitleBody_Answer/unix.stackexchange.com.jsonl.gz",
1174
- "lines": 155414,
1175
- "weight": 13
1176
- },
1177
- {
1178
- "name": "stackexchange_Title_Answer/unix.stackexchange.com.jsonl.gz",
1179
- "lines": 155414,
1180
- "weight": 13
1181
- },
1182
- {
1183
- "name": "stackexchange_TitleBody_Answer/tex.stackexchange.com.jsonl.gz",
1184
- "lines": 171628,
1185
- "weight": 15
1186
- },
1187
- {
1188
- "name": "stackexchange_Title_Answer/tex.stackexchange.com.jsonl.gz",
1189
- "lines": 171628,
1190
- "weight": 15
1191
- },
1192
- {
1193
- "name": "stackexchange_title_body/physics.stackexchange.com.jsonl.gz",
1194
- "lines": 173307,
1195
- "weight": 15
1196
- },
1197
- {
1198
- "name": "stackexchange_title_body/stats.stackexchange.com.jsonl.gz",
1199
- "lines": 173466,
1200
- "weight": 15
1201
- },
1202
- {
1203
- "name": "stackexchange_title_body/unix.stackexchange.com.jsonl.gz",
1204
- "lines": 185997,
1205
- "weight": 16
1206
- },
1207
- {
1208
- "name": "stackexchange_title_body/tex.stackexchange.com.jsonl.gz",
1209
- "lines": 202954,
1210
- "weight": 17
1211
- },
1212
- {
1213
- "name": "TriviaQA_pairs.jsonl.gz",
1214
- "lines": 73346,
1215
- "weight": 19
1216
- },
1217
- {
1218
- "name": "stackexchange_TitleBody_Answer/serverfault.com.jsonl.gz",
1219
- "lines": 238507,
1220
- "weight": 20
1221
- },
1222
- {
1223
- "name": "stackexchange_Title_Answer/serverfault.com.jsonl.gz",
1224
- "lines": 238507,
1225
- "weight": 20
1226
- },
1227
- {
1228
- "name": "stackexchange_duplicate_questions_title-body_title-body.jsonl.gz",
1229
- "lines": 250460,
1230
- "weight": 21
1231
- },
1232
- {
1233
- "name": "stackexchange_duplicate_questions_body_body.jsonl.gz",
1234
- "lines": 250519,
1235
- "weight": 21
1236
- },
1237
- {
1238
- "name": "squad_pairs.jsonl.gz",
1239
- "lines": 87599,
1240
- "weight": 22
1241
- },
1242
- {
1243
- "name": "stackexchange_TitleBody_Answer/askubuntu.com.jsonl.gz",
1244
- "lines": 267135,
1245
- "weight": 22
1246
- },
1247
- {
1248
- "name": "stackexchange_Title_Answer/askubuntu.com.jsonl.gz",
1249
- "lines": 267135,
1250
- "weight": 22
1251
- },
1252
- {
1253
- "name": "stackexchange_title_body/serverfault.com.jsonl.gz",
1254
- "lines": 270904,
1255
- "weight": 23
1256
- },
1257
- {
1258
- "name": "NQ-train_pairs.jsonl.gz",
1259
- "lines": 100231,
1260
- "weight": 25
1261
- },
1262
- {
1263
- "name": "SimpleWiki.jsonl.gz",
1264
- "lines": 102225,
1265
- "weight": 26
1266
- },
1267
- {
1268
- "name": "quora_duplicates_triplets.jsonl.gz",
1269
- "lines": 103663,
1270
- "weight": 26
1271
- },
1272
- {
1273
- "name": "stackexchange_duplicate_questions_title_title.jsonl.gz",
1274
- "lines": 304525,
1275
- "weight": 26
1276
- },
1277
- {
1278
- "name": "altlex.jsonl.gz",
1279
- "lines": 112696,
1280
- "weight": 28
1281
- },
1282
- {
1283
- "name": "stackexchange_title_body/askubuntu.com.jsonl.gz",
1284
- "lines": 347925,
1285
- "weight": 29
1286
- },
1287
- {
1288
- "name": "stackexchange_TitleBody_Answer/superuser.com.jsonl.gz",
1289
- "lines": 352610,
1290
- "weight": 30
1291
- },
1292
- {
1293
- "name": "stackexchange_Title_Answer/superuser.com.jsonl.gz",
1294
- "lines": 352610,
1295
- "weight": 30
1296
- },
1297
- {
1298
- "name": "wikihow.jsonl.gz",
1299
- "lines": 128542,
1300
- "weight": 32
1301
- },
1302
- {
1303
- "name": "stackexchange_title_body/superuser.com.jsonl.gz",
1304
- "lines": 435463,
1305
- "weight": 36
1306
- },
1307
- {
1308
- "name": "stackexchange_title_body/small_stackexchanges.jsonl.gz",
1309
- "lines": 448146,
1310
- "weight": 37
1311
- },
1312
- {
1313
- "name": "stackexchange_TitleBody_Answer/small_stackexchanges.jsonl.gz",
1314
- "lines": 460256,
1315
- "weight": 38
1316
- },
1317
- {
1318
- "name": "stackexchange_Title_Answer/small_stackexchanges.jsonl.gz",
1319
- "lines": 460256,
1320
- "weight": 38
1321
- },
1322
- {
1323
- "name": "sentence-compression.jsonl.gz",
1324
- "lines": 180000,
1325
- "weight": 45
1326
- },
1327
- {
1328
- "name": "AllNLI.jsonl.gz",
1329
- "lines": 277230,
1330
- "weight": 69
1331
- },
1332
- {
1333
- "name": "eli5_question_answer.jsonl.gz",
1334
- "lines": 325475,
1335
- "weight": 81
1336
- },
1337
- {
1338
- "name": "reddit/reddit_2015.jsonl.gz",
1339
- "lines": 135108166,
1340
- "weight": 82
1341
- },
1342
- {
1343
- "name": "reddit/reddit_2016.jsonl.gz",
1344
- "lines": 159164386,
1345
- "weight": 82
1346
- },
1347
- {
1348
- "name": "reddit/reddit_2017.jsonl.gz",
1349
- "lines": 191485219,
1350
- "weight": 82
1351
- },
1352
- {
1353
- "name": "reddit/reddit_2018.jsonl.gz",
1354
- "lines": 240726659,
1355
- "weight": 82
1356
- },
1357
- {
1358
- "name": "stackexchange_TitleBody_Answer/math.stackexchange.com.jsonl.gz",
1359
- "lines": 1100953,
1360
- "weight": 83
1361
- },
1362
- {
1363
- "name": "stackexchange_Title_Answer/math.stackexchange.com.jsonl.gz",
1364
- "lines": 1100953,
1365
- "weight": 83
1366
- },
1367
- {
1368
- "name": "stackexchange_title_body/math.stackexchange.com.jsonl.gz",
1369
- "lines": 1338443,
1370
- "weight": 83
1371
- },
1372
- {
1373
- "name": "stackexchange_TitleBody_Answer/stackoverflow.com-Posts.jsonl.gz",
1374
- "lines": 15768211,
1375
- "weight": 83
1376
- },
1377
- {
1378
- "name": "stackexchange_Title_Answer/stackoverflow.com-Posts.jsonl.gz",
1379
- "lines": 15768211,
1380
- "weight": 83
1381
- },
1382
- {
1383
- "name": "stackexchange_title_body/stackoverflow.com-Posts.jsonl.gz",
1384
- "lines": 18562443,
1385
- "weight": 83
1386
- },
1387
- {
1388
- "name": "specter_train_triples.jsonl.gz",
1389
- "lines": 684100,
1390
- "weight": 84
1391
- },
1392
- {
1393
- "name": "S2ORC_title_abstract.jsonl.gz",
1394
- "lines": 41769185,
1395
- "weight": 123
1396
- },
1397
- {
1398
- "name": "S2ORC_citation_pairs.jsonl.gz",
1399
- "lines": 52603982,
1400
- "weight": 123
1401
- },
1402
- {
1403
- "name": "PAQ_pairs.jsonl.gz",
1404
- "lines": 64371441,
1405
- "weight": 123
1406
- },
1407
- {
1408
- "name": "WikiAnswers_pairs.jsonl.gz",
1409
- "lines": 77427422,
1410
- "weight": 123
1411
- },
1412
- {
1413
- "name": "S2ORC_citation_pairs_abstract.jsonl.gz",
1414
- "lines": 116288806,
1415
- "weight": 123
1416
- },
1417
- {
1418
- "name": "searchQA_question_top5_snippets_merged.jsonl.gz",
1419
- "lines": 582261,
1420
- "weight": 144
1421
- },
1422
- {
1423
- "name": "yahoo_answers_title_question.jsonl.gz",
1424
- "lines": 659896,
1425
- "weight": 163
1426
- },
1427
- {
1428
- "name": "yahoo_answers_question_answer.jsonl.gz",
1429
- "lines": 681164,
1430
- "weight": 169
1431
- },
1432
- {
1433
- "name": "yahoo_answers_title_answer.jsonl.gz",
1434
- "lines": 1198260,
1435
- "weight": 247
1436
- },
1437
- {
1438
- "name": "amazon-qa-train-pairs.jsonl.gz",
1439
- "lines": 2448839,
1440
- "weight": 247
1441
- },
1442
- {
1443
- "name": "gooaq_pairs.jsonl.gz",
1444
- "lines": 3012496,
1445
- "weight": 247
1446
- },
1447
- {
1448
- "name": "msmarco-query_passage_negative.jsonl.gz",
1449
- "lines": 9144553,
1450
- "weight": 247
1451
- }
1452
- ]
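
For orientation: each entry in the deleted config above pairs a dataset file with a sampling weight. The deleted train_script.py further down in this commit expands each weight into repeated dataset indices, so the probability that a file contributes to a training batch is proportional to its weight. A minimal sketch of that consumption logic (the config path here is illustrative):

```python
# Sketch of how data_config.json was consumed, mirroring the dataset_indices
# logic in the deleted train_script.py below. Not part of the remaining code.
import json
import random

with open("data_config.json") as f_in:  # illustrative path
    data_config = json.load(f_in)

dataset_indices = []
for idx, entry in enumerate(data_config):
    # A weight of 247 makes a dataset ~123x more likely to be drawn than weight 2.
    dataset_indices.extend([idx] * entry["weight"])

# Drawing a dataset for the next batch is then a uniform choice over the
# expanded index list:
chosen = data_config[random.choice(dataset_indices)]
print(chosen["name"])
```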
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
- size 90868376
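
This file, like the other binary deletions below, is a Git LFS pointer (spec v1): `oid` is the SHA-256 of the actual payload and `size` is its length in bytes, so this deletion removes roughly 90 MB of model weights from the working tree. A small, hypothetical helper (not part of this repo) that checks a local file against such a pointer:

```python
# Sketch: verify a local file against a Git LFS pointer (spec v1).
# The function name and local path are hypothetical, for illustration only.
import hashlib
import os

def matches_pointer(path: str, oid: str, size: int) -> bool:
    # Cheap check first: the pointer records the payload size in bytes.
    if os.path.getsize(path) != size:
        return False
    # The oid is the SHA-256 hex digest of the payload.
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest() == oid

# Example with the values from the pointer above:
# matches_pointer("model.safetensors",
#                 "53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db",
#                 90868376)
```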
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:6fd5d72fe4589f189f8ebc006442dbb529bb7ce38f8082112682524616046452
- size 90405214
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O1.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1391c6fc20b5530250bc15cbe1f47578ffeca55ab0551d335cc668b6299a88ec
- size 90360328
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O2.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1de3905029190b398c7d300b530e320cf4b5e7d3dfb9af1429ebd73fd9a16faf
- size 90326566
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O3.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a44f671e364dddbac31f203f07b91be6b0a35e51936e5ebfab65b6d9538b83ff
- size 90326497
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_O4.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:1667d7f3ba669048b13a96ee3a44456d5e42c8f44588ae8b603430e16160c485
- size 45212349
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_arm64.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_quint8_avx2.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:b941bf19f1f1283680f449fa6a7336bb5600bdcd5f84d10ddc5cd72218a0fd21
- size 23046789
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8b86cab4722e2aefab310cf96d4d5a9eb3b187f7d9670a082afc55c7fa0d392a
- size 90265744
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model.xml DELETED
The diff for this file is too large to render. See raw diff
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c92ea4af3c6bc7b4a0f3b3d61b147c850f4dbdd7c9e7beee0c0c70dc12da289b
- size 22933664
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/openvino/openvino_model_qint8_quantized.xml DELETED
The diff for this file is too large to render. See raw diff
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:c3a85f238711653950f6a79ece63eb0ea93d76f6a6284be04019c53733baf256
- size 90888945
 
 
 
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/train_script.py DELETED
@@ -1,344 +0,0 @@
- """
- Train script for a single file
-
- Need to set the TPU address first:
- export XRT_TPU_CONFIG="localservice;0;localhost:51011"
- """
-
- import torch.multiprocessing as mp
- import threading
- import time
- import random
- import sys
- import argparse
- import gzip
- import json
- import logging
- import tqdm
- import torch
- from torch import nn
- from torch.utils.data import DataLoader
- import torch
- import torch_xla
- import torch_xla.core
- import torch_xla.core.functions
- import torch_xla.core.xla_model as xm
- import torch_xla.distributed.xla_multiprocessing as xmp
- import torch_xla.distributed.parallel_loader as pl
- import os
- from shutil import copyfile
-
-
- from transformers import (
-     AdamW,
-     AutoModel,
-     AutoTokenizer,
-     get_linear_schedule_with_warmup,
-     set_seed,
- )
-
- class AutoModelForSentenceEmbedding(nn.Module):
-     def __init__(self, model_name, tokenizer, normalize=True):
-         super(AutoModelForSentenceEmbedding, self).__init__()
-
-         self.model = AutoModel.from_pretrained(model_name)
-         self.normalize = normalize
-         self.tokenizer = tokenizer
-
-     def forward(self, **kwargs):
-         model_output = self.model(**kwargs)
-         embeddings = self.mean_pooling(model_output, kwargs['attention_mask'])
-         if self.normalize:
-             embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
-
-         return embeddings
-
-     def mean_pooling(self, model_output, attention_mask):
-         token_embeddings = model_output[0]  # First element of model_output contains all token embeddings
-         input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
-         return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
-
-     def save_pretrained(self, output_path):
-         if xm.is_master_ordinal():
-             self.tokenizer.save_pretrained(output_path)
-             self.model.config.save_pretrained(output_path)
-
-         xm.save(self.model.state_dict(), os.path.join(output_path, "pytorch_model.bin"))
-
-
-
-
- def train_function(index, args, queue):
-     tokenizer = AutoTokenizer.from_pretrained(args.model)
-     model = AutoModelForSentenceEmbedding(args.model, tokenizer)
-
-
-     ### Train Loop
-     device = xm.xla_device()
-     model = model.to(device)
-
-     # Instantiate optimizer
-     optimizer = AdamW(params=model.parameters(), lr=2e-5, correct_bias=True)
-
-     lr_scheduler = get_linear_schedule_with_warmup(
-         optimizer=optimizer,
-         num_warmup_steps=500,
-         num_training_steps=args.steps,
-     )
-
-     # Now we train the model
-     cross_entropy_loss = nn.CrossEntropyLoss()
-     max_grad_norm = 1
-
-     model.train()
-
-     for global_step in tqdm.trange(args.steps, disable=not xm.is_master_ordinal()):
-         #### Get the batch data
-         batch = queue.get()
-         #print(index, "batch {}x{}".format(len(batch), ",".join([str(len(b)) for b in batch])))
-
-
-         if len(batch[0]) == 2:  #(anchor, positive)
-             text1 = tokenizer([b[0] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-             text2 = tokenizer([b[1] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-
-             ### Compute embeddings
-             embeddings_a = model(**text1.to(device))
-             embeddings_b = model(**text2.to(device))
-
-             ### Gather all embedings
-             embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
-             embeddings_b = torch_xla.core.functions.all_gather(embeddings_b)
-
-             ### Compute similarity scores 512 x 512
-             scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale
-
-             ### Compute cross-entropy loss
-             labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device)  # Example a[i] should match with b[i]
-
-             ## Symmetric loss as in CLIP
-             loss = (cross_entropy_loss(scores, labels) + cross_entropy_loss(scores.transpose(0, 1), labels)) / 2
-
-         else:  #(anchor, positive, negative)
-             text1 = tokenizer([b[0] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-             text2 = tokenizer([b[1] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-             text3 = tokenizer([b[2] for b in batch], return_tensors="pt", max_length=args.max_length, truncation=True, padding="max_length")
-
-             embeddings_a = model(**text1.to(device))
-             embeddings_b1 = model(**text2.to(device))
-             embeddings_b2 = model(**text3.to(device))
-
-             embeddings_a = torch_xla.core.functions.all_gather(embeddings_a)
-             embeddings_b1 = torch_xla.core.functions.all_gather(embeddings_b1)
-             embeddings_b2 = torch_xla.core.functions.all_gather(embeddings_b2)
-
-             embeddings_b = torch.cat([embeddings_b1, embeddings_b2])
-
-             ### Compute similarity scores 512 x 1024
-             scores = torch.mm(embeddings_a, embeddings_b.transpose(0, 1)) * args.scale
-
-             ### Compute cross-entropy loss
-             labels = torch.tensor(range(len(scores)), dtype=torch.long, device=embeddings_a.device)  # Example a[i] should match with b[i]
-
-             ## One-way loss
-             loss = cross_entropy_loss(scores, labels)
-
-
-         # Backward pass
-         optimizer.zero_grad()
-         loss.backward()
-         torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
-
-         xm.optimizer_step(optimizer, barrier=True)
-         lr_scheduler.step()
-
-
-         #Save model
-         if (global_step+1) % args.save_steps == 0:
-             output_path = os.path.join(args.output, str(global_step+1))
-             xm.master_print("save model: "+output_path)
-             model.save_pretrained(output_path)
-
-
-     output_path = os.path.join(args.output, "final")
-     xm.master_print("save model final: "+ output_path)
-     model.save_pretrained(output_path)
-
-
- def produce_data(args, queue, filepaths, dataset_indices):
-     global_batch_size = args.batch_size*args.nprocs  #Global batch size
-     size_per_dataset = int(global_batch_size / args.datasets_per_batch)  #How many datasets per batch
-     num_same_dataset = int(size_per_dataset / args.batch_size)
-     print("producer", "global_batch_size", global_batch_size)
-     print("producer", "size_per_dataset", size_per_dataset)
-     print("producer", "num_same_dataset", num_same_dataset)
-
-     datasets = []
-     for filepath in filepaths:
-         if "reddit_" in filepath:  #Special dataset class for Reddit files
-             data_obj = RedditDataset(filepath)
-         else:
-             data_obj = Dataset(filepath)
-         datasets.append(iter(data_obj))
-
-     # Store if dataset is in a 2 col or 3 col format
-     num_cols = {idx: len(next(dataset)) for idx, dataset in enumerate(datasets)}
-
-     while True:
-         texts_in_batch = set()
-         batch_format = None  #2 vs 3 col format for this batch
-
-         #Add data from several sub datasets
-         for _ in range(args.datasets_per_batch):
-             valid_dataset = False  #Check that datasets have the same 2/3 col format
-             while not valid_dataset:
-                 data_idx = random.choice(dataset_indices)
-                 if batch_format is None:
-                     batch_format = num_cols[data_idx]
-                     valid_dataset = True
-                 else:  #Check that this dataset has the same format
-                     valid_dataset = (batch_format == num_cols[data_idx])
-
-             #Get data from this dataset
-             dataset = datasets[data_idx]
-             for _ in range(num_same_dataset):
-                 for _ in range(args.nprocs):
-                     batch_device = []  #A batch for one device
-                     while len(batch_device) < args.batch_size:
-                         sample = next(dataset)
-                         in_batch = False
-                         for text in sample:
-                             if text in texts_in_batch:
-                                 in_batch = True
-                                 break
-
-                         if not in_batch:
-                             for text in sample:
-                                 texts_in_batch.add(text)
-                             batch_device.append(sample)
-
-                     queue.put(batch_device)
-
-
- class RedditDataset:
-     """
-     A class that handles the reddit data files
-     """
-     def __init__(self, filepath):
-         self.filepath = filepath
-
-     def __iter__(self):
-         while True:
-             with gzip.open(self.filepath, "rt") as fIn:
-                 for line in fIn:
-                     data = json.loads(line)
-
-                     if "response" in data and "context" in data:
-                         yield [data["response"], data["context"]]
-
- class Dataset:
-     """
-     A class that handles one dataset
-     """
-     def __init__(self, filepath):
-         self.filepath = filepath
-
-     def __iter__(self):
-         max_dataset_size = 10*1000*1000  #Cache small datasets in memory
-         dataset = []
-         data_format = None
-
-         while dataset is None or len(dataset) == 0:
-             with gzip.open(self.filepath, "rt") as fIn:
-                 for line in fIn:
-                     data = json.loads(line)
-                     if isinstance(data, dict):
-                         data = data['texts']
-
-                     if data_format is None:
-                         data_format = len(data)
-
-                     #Ensure that all entries are of the same 2/3 col format
-                     assert len(data) == data_format
-
-                     if dataset is not None:
-                         dataset.append(data)
-                         if len(dataset) >= max_dataset_size:
-                             dataset = None
-
-                     yield data
-
-         # Data loaded. Now stream to the queue
-         # Shuffle for each epoch
-         while True:
-             random.shuffle(dataset)
-             for data in dataset:
-                 yield data
-
-
-
- if __name__ == "__main__":
-     parser = argparse.ArgumentParser()
-     parser.add_argument('--model', default='nreimers/MiniLM-L6-H384-uncased')
-     parser.add_argument('--steps', type=int, default=2000)
-     parser.add_argument('--save_steps', type=int, default=10000)
-     parser.add_argument('--batch_size', type=int, default=64)
-     parser.add_argument('--max_length', type=int, default=128)
-     parser.add_argument('--nprocs', type=int, default=8)
-     parser.add_argument('--datasets_per_batch', type=int, default=2, help="Number of datasets per batch")
-     parser.add_argument('--scale', type=float, default=20, help="Use 20 for cossim, and 1 when you work with unnormalized embeddings with dot product")
-     parser.add_argument('--data_folder', default="/data", help="Folder with your dataset files")
-     parser.add_argument('data_config', help="A data_config.json file")
-     parser.add_argument('output')
-     args = parser.parse_args()
-
-     # Ensure global batch size is divisble by data_sample_size
-     assert (args.batch_size*args.nprocs) % args.datasets_per_batch == 0
-
-     logging.info("Output: "+args.output)
-     if os.path.exists(args.output):
-         print("Output folder already exists.")
-         input("Continue?")
-
-     # Write train script to output path
-     os.makedirs(args.output, exist_ok=True)
-
-     data_config_path = os.path.join(args.output, 'data_config.json')
-     copyfile(args.data_config, data_config_path)
-
-     train_script_path = os.path.join(args.output, 'train_script.py')
-     copyfile(__file__, train_script_path)
-     with open(train_script_path, 'a') as fOut:
-         fOut.write("\n\n# Script was called via:\n#python " + " ".join(sys.argv))
-
-
-
-     #Load data config
-     with open(args.data_config) as fIn:
-         data_config = json.load(fIn)
-
-     queue = mp.Queue(maxsize=100*args.nprocs)
-
-     filepaths = []
-     dataset_indices = []
-     for idx, data in enumerate(data_config):
-         filepaths.append(os.path.join(os.path.expanduser(args.data_folder), data['name']))
-         dataset_indices.extend([idx]*data['weight'])
-
-     # Start producer
-     p = mp.Process(target=produce_data, args=(args, queue, filepaths, dataset_indices))
-     p.start()
-
-     # Run training
-     print("Start processes:", args.nprocs)
-     xmp.spawn(train_function, args=(args, queue), nprocs=args.nprocs, start_method='fork')
-     print("Training done")
-     print("It might be that not all processes exit automatically. In that case you must manually kill this process.")
-     print("With 'pkill python' you can kill all remaining python processes")
-     p.kill()
-     exit()
-
-
-
- # Script was called via:
- #python train_many_data_files_v2.py --steps 1000000 --batch_size 128 --model nreimers/MiniLM-L6-H384-uncased train_data_configs/all_datasets_v4.json output/all_datasets_v4_MiniLM-L6-H384-uncased-batch128
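
For reference, the objective removed with this script is contrastive training with in-batch negatives: the scaled similarity matrix between gathered anchor and positive embeddings is fed to a symmetric cross-entropy, as in CLIP. A standalone sketch of that loss under the script's own defaults (tensor dimensions are illustrative):

```python
# Minimal sketch of the symmetric in-batch-negatives loss from the deleted
# train_script.py above. Row i of the score matrix should prefer column i.
import torch
from torch import nn

def symmetric_info_nce(embeddings_a, embeddings_b, scale=20.0):
    # Rows: anchors, columns: positives of the whole (gathered) batch.
    scores = embeddings_a @ embeddings_b.T * scale
    labels = torch.arange(scores.size(0), device=scores.device)
    ce = nn.CrossEntropyLoss()
    # Same symmetric (CLIP-style) formulation as the script.
    return (ce(scores, labels) + ce(scores.T, labels)) / 2

# With L2-normalized embeddings, scale=20 matches the script's default for
# cosine similarity; the script's --scale help text suggests 1 for
# unnormalized dot-product embeddings.
a = torch.nn.functional.normalize(torch.randn(8, 384), dim=1)
b = torch.nn.functional.normalize(torch.randn(8, 384), dim=1)
print(symmetric_info_nce(a, b).item())
```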
app/ml/trainning/models/sentence-transformers_all-MiniLM-L6-v2/vocab.txt DELETED
The diff for this file is too large to render. See raw diff
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:53aa51172d142c89d9012cce15ae4d6cc0ca6895895114379cacb4fab128d9db
- size 90868376
 
 
 
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/58d4a9a45664eb9e12de9549c548c09b6134c17f DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7dfc82496ec33f906b5b0d6750c1e2397da6530c74d1ae3568c55bc2739125e7
- size 10454
 
 
 
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/cb202bfe2e3c98645018a6d12f182a434c9d3e02 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:be50c3628f2bf5bb5e3a7f17b1f74611b2561a3a27eeab05e5aa30f411572037
- size 466247
 
 
 
 
models/models--sentence-transformers--all-MiniLM-L6-v2/blobs/fb140275c155a9c7c5a3b3e0e77a9e839594a938 DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:07eced375cec144d27c900241f3e339478dec958f92fddbc551f295c992038a3
- size 231508
 
 
 
 
models/sentence-transformers_all-MiniLM-L6-v2/README.md DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:7dfc82496ec33f906b5b0d6750c1e2397da6530c74d1ae3568c55bc2739125e7
- size 10454
 
 
 
 
models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053
 
 
 
 
models/sentence-transformers_all-MiniLM-L6-v2/onnx/model_qint8_avx512_vnni.onnx DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:4278337fd0ff3c68bfb6291042cad8ab363e1d9fbc43dcb499fe91c871902474
- size 23026053