| | """debug_downloaded_data.py - Inspect the downloaded conversation format"""
|
| |
|
| | import json
|
| |
|
def _summarize_value(value, limit=100):
    """Return a short printable stand-in for a top-level field value.

    Long strings are cut to ``limit`` characters plus ``"..."``; dicts and
    lists are replaced by a one-line description; anything else is returned
    unchanged.
    """
    if isinstance(value, str) and len(value) > limit:
        return value[:limit] + "..."
    if isinstance(value, dict):
        return f"Dict with keys: {list(value.keys())}"
    if isinstance(value, list):
        return f"List with {len(value)} items"
    return value


def _explore_nested(nested):
    """Print a closer look at a nested dict or list field."""
    if isinstance(nested, dict):
        print(f" Keys: {list(nested.keys())}")
        # Only the first three entries, to keep the dump readable.
        for nkey, nvalue in list(nested.items())[:3]:
            if isinstance(nvalue, str) and len(nvalue) > 50:
                nvalue = nvalue[:50] + "..."
            print(f" {nkey}: {nvalue}")
    elif isinstance(nested, list) and nested:
        first_item = nested[0]
        print(f" First item type: {type(first_item)}")
        # Inspect the first element, not the list itself: the list can never
        # be a dict, so testing it would make this branch unreachable.
        if isinstance(first_item, dict):
            print(f" First item keys: {list(first_item.keys())}")


def inspect_downloaded_data(
    data_path: str = "data/conversation_raw/OpenAssistant_oasst1_raw.jsonl",
    max_records: int = 5,
) -> None:
    """Print a summary of the first few records of a JSONL file.

    For each non-blank line among the first ``max_records`` lines, the line
    is parsed as JSON and its top-level keys and abbreviated values are
    printed. The ``prompt``, ``conversation`` and ``messages`` fields are
    explored one level deeper when they hold a dict or a list.

    Args:
        data_path: Path of the JSONL file to inspect.
        max_records: Number of leading lines to read. Blank lines count
            toward this limit but are not printed.

    Returns:
        None. All output goes to stdout. I/O, decoding and JSON parsing
        failures are reported as an ``Error reading file: ...`` line
        instead of being raised.
    """
    print("🔍 Inspecting downloaded OpenAssistant data...")
    print("="*50)

    try:
        with open(data_path, 'r', encoding='utf-8') as f:
            # zip() stops as soon as the range is exhausted, so no more than
            # max_records lines are ever pulled from the file.
            for number, raw_line in zip(range(1, max_records + 1), f):
                line = raw_line.strip()
                if not line:
                    continue

                record = json.loads(line)
                print(f"\nRecord {number}:")

                # A JSONL line may legally hold a list or scalar; report it
                # rather than failing on .keys() below.
                if not isinstance(record, dict):
                    print(f"Not a JSON object: {type(record).__name__}")
                    continue

                print(f"Top-level keys: {list(record.keys())}")

                for key, value in record.items():
                    print(f" {key}: {_summarize_value(value)}")

                for key in ['prompt', 'conversation', 'messages']:
                    nested = record.get(key)
                    if isinstance(nested, (dict, list)):
                        print(f"\n Exploring {key}:")
                        _explore_nested(nested)
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error reading file: {e}")
|
| |
|
# Run the inspection only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    inspect_downloaded_data()
|
| |
|