Spaces:
Running
Running
Update synthgen.py
Browse files- synthgen.py +85 -2
synthgen.py
CHANGED
|
@@ -47,8 +47,8 @@ def generate_synthetic_text(
|
|
| 47 |
"model": model,
|
| 48 |
"messages": [
|
| 49 |
{"role": "system", "content": system_message},
|
| 50 |
-
|
| 51 |
-
|
| 52 |
"extra_headers": {
|
| 53 |
# "HTTP-Referer": "YOUR_SITE_URL",
|
| 54 |
"X-Title": "SynthGen",
|
|
@@ -194,6 +194,89 @@ def generate_synthetic_conversation(
|
|
| 194 |
|
| 195 |
return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
|
| 196 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 197 |
|
| 198 |
# --- Main Execution (Example Usage) ---
|
| 199 |
if __name__ == "__main__":
|
|
|
|
| 47 |
"model": model,
|
| 48 |
"messages": [
|
| 49 |
{"role": "system", "content": system_message},
|
| 50 |
+
{"role": "user", "content": prompt},
|
| 51 |
+
],
|
| 52 |
"extra_headers": {
|
| 53 |
# "HTTP-Referer": "YOUR_SITE_URL",
|
| 54 |
"X-Title": "SynthGen",
|
|
|
|
| 194 |
|
| 195 |
return f"Generated conversation for prompt '{system_prompt}':\n\n{conversation_text}"
|
| 196 |
|
| 197 |
+
# Function to generate different types of content based on a topic
def generate_corpus_content(
    topic: str,
    content_type: str,  # e.g., "Corpus Snippets", "Short Story", "Article"
    length_param: int,  # Meaning depends on type (e.g., num snippets, approx words)
    model: str,
    system_message_base: str = "You are a helpful assistant generating synthetic content.",
    temperature: Optional[float] = 0.7,
    top_p: Optional[float] = None,
    max_tokens: Optional[int] = None  # Use a larger default if None
) -> str:
    """
    Generates different types of synthetic content based on a topic.

    Args:
        topic: The central topic for the content.
        content_type: The type of content to generate. One of
            "Corpus Snippets", "Short Story", or "Article".
        length_param: A parameter controlling length/quantity; its meaning
            depends on content_type (number of snippets, or approximate words).
            Non-positive values fall back to a per-type default.
        model: The model ID.
        system_message_base: Base system message. NOTE: every supported
            content type installs its own specialized system message, so this
            base is currently never sent; it is kept for interface stability.
        temperature: Model temperature.
        top_p: Model top_p.
        max_tokens: Model max_tokens. When None, a rough per-type estimate
            is derived from length_param.

    Returns:
        The generated content string (prefixed with a title) or an error
        message starting with "Error:".
    """
    if content_type == "Corpus Snippets":
        # Default number of snippets when the caller gives a non-positive count.
        if length_param <= 0:
            length_param = 5
        prompt = (
            f"Generate exactly {length_param} distinct text snippets related to the topic: '{topic}'. "
            f"Each snippet should be a few sentences long and focus on a different aspect if possible. "
            f"Present each snippet clearly, perhaps separated by a blank line or a marker like '---'."
        )
        system_message = "You are an AI generating diverse text snippets for a data corpus."
        if max_tokens is None:
            # Rough per-snippet token budget when the caller did not cap tokens.
            max_tokens = length_param * 150
    elif content_type == "Short Story":
        # Default approximate word count.
        if length_param <= 0:
            length_param = 300
        prompt = (
            f"Write a short story (approximately {length_param} words) centered around the topic: '{topic}'. "
            f"The story should have a clear beginning, middle, and end."
        )
        system_message = "You are a creative AI writing a short story."
        if max_tokens is None:
            # Rough words-to-tokens estimate.
            max_tokens = int(length_param * 2.5)
    elif content_type == "Article":
        # Default approximate word count.
        if length_param <= 0:
            length_param = 500
        prompt = (
            f"Write an informative article (approximately {length_param} words) about the topic: '{topic}'. "
            f"The article should be well-structured, factual (to the best of your ability), and engaging."
        )
        system_message = "You are an AI assistant writing an informative article."
        if max_tokens is None:
            # Rough words-to-tokens estimate.
            max_tokens = int(length_param * 2.5)
    else:
        # Reject unknown content types early; no request is issued.
        return f"Error: Unknown content type '{content_type}'."

    # --- Call the core generation function ---
    generated_text = generate_synthetic_text(
        prompt=prompt,
        model=model,
        system_message=system_message,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens
    )

    # Propagate any error from generate_synthetic_text unchanged; otherwise
    # add a title for clarity.
    if generated_text.startswith("Error:"):
        return generated_text
    return f"Generated {content_type} for topic '{topic}':\n\n{generated_text}"
|
| 280 |
|
| 281 |
# --- Main Execution (Example Usage) ---
|
| 282 |
if __name__ == "__main__":
|