# Byte-lingua-code / case_study_graph.py
# 2ira's picture
# offline_compression_graph_code
# 72c0672 verified
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from difflib import Differ
import pandas as pd
# Example collision records used as input for the plots in this script.
#
# Every record carries the same four keys:
#   colliding_token_sequence -- list of integer token ids
#   num_raw_variants         -- reported variant count for the record
#   raw_chunk_variants       -- list of raw text chunks
#   levenshtein_analysis     -- "distances" plus average/max/min summaries
#
# NOTE(review): presumably each record lists the raw chunks that map to the
# same token sequence -- confirm against whatever produced this data.
collisions = [
    # NOTE(review): this record is not self-consistent as written: 19 variants
    # are listed while num_raw_variants is 21, there are 20 distances, and the
    # listed distances give mean 6.95 / max 13 rather than the stored 8.74 / 23.
    # Presumably excerpted from a larger result set -- verify before relying
    # on the summary figures.
    {
        "colliding_token_sequence": [265, 393, 320],
        "num_raw_variants": 21,
        "raw_chunk_variants": [
            "\nif __name__ == '_",
            "\nif __n",
            "\nif __name__ == '__main",
            "\nif __name__ == '__main__'",
            "\nif __",
            "\nif _",
            "\nif __name__ ",
            "\nif __name__ =",
            "\nif __name__ == '__",
            "\nif __na",
            "\nif __name__",
            "\nif __name",
            "\nif __name__ == '__main__':",
            "\nif __name__ == '__main__':\n",
            "\nif __nam",
            "\nif __name_",
            "\nif __name__ == ",
            "\nif __name__ == '__ma",
            "\nif __name__ == '__main_",
        ],
        "levenshtein_analysis": {
            "distances": [
                11, 5, 8, 12, 13, 5, 4, 1, 10, 6,
                8, 9, 10, 9, 7, 2, 3, 6, 7, 3,
            ],
            "average_distance": 8.74,
            "max_distance": 23,
            "min_distance": 1,
        },
    },
    # The second variant ends in U+FFFD (replacement character); it is kept
    # exactly as given.
    {
        "colliding_token_sequence": [506, 354, 256],
        "num_raw_variants": 2,
        "raw_chunk_variants": [
            "数据",
            "数�",
        ],
        "levenshtein_analysis": {
            "distances": [0, 1],
            "average_distance": 0.5,
            "max_distance": 1,
            "min_distance": 0,
        },
    },
    # NOTE(review): the second and fourth variants are identical strings here;
    # possibly whitespace was lost when the data was copied -- confirm.
    {
        "colliding_token_sequence": [123, 456, 789],
        "num_raw_variants": 4,
        "raw_chunk_variants": [
            " } ",
            " }\r\n ",
            "!",
            " }\r\n ",
        ],
        "levenshtein_analysis": {
            "distances": [2, 1, 4, 7],
            "average_distance": 3.5,
            "max_distance": 7,
            "min_distance": 1,
        },
    },
]
# --- 1. Plot Text Diff (simplified) ---
def plot_text_diff(variants, title, save_path):
    """Draw a line-level diff of the first two variants and save it as an image.

    Only ``variants[0]`` and ``variants[1]`` are compared; any further entries
    are ignored. Both strings are split with ``str.splitlines`` and passed to
    ``difflib.Differ``; each resulting diff line becomes one row of text,
    coloured by its leading marker ("+" red, "-" green, " " blue, anything
    else -- e.g. Differ's "?" hint lines -- black).

    Args:
        variants: Sequence of strings with at least two entries.
        title: Title placed above the diff.
        save_path: Destination passed to ``Figure.savefig``.

    Raises:
        IndexError: If ``variants`` has fewer than two entries.
    """
    differ = Differ()
    diff = list(differ.compare(variants[0].splitlines(), variants[1].splitlines()))

    colors = {"+": "red", "-": "green", " ": "blue"}

    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        ax.set_title(title)
        ax.set_axis_off()

        # Row i is placed at y == i. Text does not autoscale the axes, so with
        # the default 0..1 limits (y growing upwards) the rows would be drawn
        # bottom-to-top and a full axes height apart. Size the y range to the
        # number of rows and flip it so the diff reads top-down.
        ax.set_xlim(0, 1)
        ax.set_ylim(max(len(diff), 1), 0)

        for row, line in enumerate(diff):
            ax.text(
                0,
                row,
                # Differ's "?" hint lines end with "\n"; drop it so a row never
                # renders as two text lines.
                line.rstrip("\n"),
                color=colors.get(line[:1], "black"),
                fontsize=12,
                va="top",
                ha="left",
                # The "?" hint lines point at columns of the line above them,
                # which only lines up in a fixed-width font.
                fontfamily="monospace",
            )

        fig.savefig(save_path, bbox_inches="tight")
    finally:
        # Release the figure even when drawing or saving fails.
        plt.close(fig)
# --- 2. Plot LCP Ratio for Variants ---
def plot_lcp_ratios(lcp_ratios, title, save_path):
    """Save a bar chart with one sky-blue bar per entry of ``lcp_ratios``.

    Bars are positioned by their index in ``lcp_ratios``; the axes are
    labelled 'Variant Index' and 'LCP Ratio'.

    Args:
        lcp_ratios: Sized sequence of bar heights, plotted in the given order.
        title: Title of the chart.
        save_path: Destination the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    variant_indices = list(range(len(lcp_ratios)))
    sns.barplot(x=variant_indices, y=lcp_ratios, color="skyblue", ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Variant Index')
    ax.set_ylabel('LCP Ratio')

    fig.savefig(save_path, bbox_inches="tight")
    plt.close(fig)
# --- 3. Levenshtein Distance Distribution ---
def plot_levenshtein_distances(distances, title, save_path):
    """Save a 10-bin salmon histogram of ``distances`` with a KDE curve overlaid.

    The axes are labelled 'Levenshtein Distance' and 'Frequency'.

    Args:
        distances: Values to histogram.
        title: Title of the chart.
        save_path: Destination the figure is written to.
    """
    fig, ax = plt.subplots(figsize=(8, 6))

    sns.histplot(data=distances, bins=10, kde=True, color="salmon", ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Levenshtein Distance')
    ax.set_ylabel('Frequency')

    fig.savefig(save_path, bbox_inches="tight")
    plt.close(fig)
# Generate the three figures (text diff, "LCP ratio" bars, Levenshtein
# histogram) for every entry in `collisions`. Cases are numbered from 1 in
# both the titles and the output file names.
for case_number, collision in enumerate(collisions, start=1):
    analysis = collision["levenshtein_analysis"]

    plot_text_diff(
        collision["raw_chunk_variants"],
        f"Text Difference Visualization Case {case_number}",
        f"text_diff_case{case_number}.png",
    )

    # NOTE(review): `collisions` stores no LCP ratios. The chart is fed the
    # average Levenshtein distance repeated num_raw_variants times, so every
    # bar has the same height and the 'LCP Ratio' axis label does not describe
    # the plotted quantity. Swap in real per-variant LCP ratios once available.
    plot_lcp_ratios(
        [analysis["average_distance"]] * collision["num_raw_variants"],
        f"LCP Ratio of Variants Case {case_number}",
        f"lcp_case{case_number}.png",
    )

    # The case number is included in the title so the three histograms can be
    # told apart, matching the other two figure titles.
    plot_levenshtein_distances(
        analysis["distances"],
        f"Levenshtein Distance Distribution Case {case_number}",
        f"levenshtein_case{case_number}.png",
    )

print("Plots generated successfully!")