LLDDWW commited on
Commit
76e08e0
Β·
1 Parent(s): c990a0c

Rewrite Gradio app with proper parsing

Browse files
Files changed (1) hide show
  1. app.py +136 -91
app.py CHANGED
@@ -1,113 +1,158 @@
1
- # app.py (HF Space)
2
-
3
-
4
- # dose_per_intake: e.g., "1회 1μ •", "1μ •", "5 mL"
5
- m_dose = re.search(r"(1회\s*)?(\d+)\s*([κ°€-힣a-zA-Z]+|mL|ml|mg)", t)
6
- dose_per_intake = None
7
- if m_dose:
8
- dose_per_intake = f"{m_dose.group(2)} {m_dose.group(3)}"
9
-
10
-
11
- # drug name (heuristic): token before mg/mL or first uppercase-like word
12
- m_drug = re.search(r"([κ°€-힣A-Za-z]+)\s*(\d+\s*(mg|mL|ml))", t)
13
- drug_name = m_drug.group(1) if m_drug else None
14
-
15
-
16
- return {
17
- "drug_name": drug_name,
18
- "dose_per_intake": dose_per_intake,
19
- "times_per_day": times_per_day,
20
- "time_slots": time_slots or None,
21
- }
22
-
23
-
24
-
25
-
26
- def ocr_and_parse(img) -> Dict[str, Any]:
27
- """Run OCR then parse fields with basic validation."""
28
- # OCR output is a list of dicts with 'generated_text'
29
- raw = ocr(img)[0]["generated_text"]
30
- fields = parse_fields(raw)
31
-
32
-
33
- # basic validation messages
34
- warn = []
35
- if not fields["drug_name"]:
36
- warn.append("μ•½ 이름 인식이 λΆˆν™•μ‹€ν•©λ‹ˆλ‹€.")
37
- if not fields["times_per_day"]:
38
- warn.append("1일 횟수λ₯Ό μ°Ύμ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€ (예: 1일 3회).")
39
-
40
-
41
- return {"raw_text": raw, "fields": fields, "warnings": warn}
42
 
43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
 
46
  def render_card(fields: Dict[str, Any]) -> Image.Image:
47
- """Draw a simple schedule card (PNG) from parsed fields."""
48
- W, H = 720, 400
49
- img = Image.new("RGB", (W, H), "white")
50
- d = ImageDraw.Draw(img)
51
- title = "였늘 볡용 일정"
52
- d.rectangle((0, 0, W, 60), fill=(230, 240, 255))
53
- d.text((24, 18), title, fill=(0, 0, 0))
54
 
 
 
 
55
 
56
- y = 90
57
- def line(label, value):
58
- nonlocal y
59
- d.text((24, y), f"{label}", fill=(60, 60, 60))
60
- d.text((180, y), f": {value if value else '-'}", fill=(0, 0, 0))
61
- y += 34
62
 
 
 
 
 
 
 
63
 
64
- line("μ•½ 이름", fields.get("drug_name"))
65
- line("1회 μš©λŸ‰", fields.get("dose_per_intake"))
66
- line("1일 횟수", fields.get("times_per_day"))
67
- slots = ", ".join(fields.get("time_slots") or [])
68
- line("μ‹œκ°„λŒ€", slots if slots else None)
69
 
 
 
70
 
71
- d.text((24, H-60), "β€» μ˜λ£Œμ§„ 처방 μš°μ„ , λ³Έ 앱은 μ•ˆλ‚΄μš©μž…λ‹ˆλ‹€.", fill=(120, 120, 120))
72
- return img
 
73
 
74
 
 
 
 
 
 
 
 
 
 
75
 
76
 
77
- def run_pipeline(image):
78
- if image is None:
79
- return "이미지λ₯Ό μ—…λ‘œλ“œν•˜μ„Έμš”.", None, None
80
- out = ocr_and_parse(image)
81
- card = render_card(out["fields"])
82
- csv_row = to_csv_row(out)
83
- return json.dumps(out, ensure_ascii=False, indent=2), card, csv_row
84
 
85
-
86
-
87
-
88
- def to_csv_row(out: Dict[str, Any]) -> str:
89
- f = out["fields"]
90
- row = [
91
- f.get("drug_name") or "",
92
- f.get("dose_per_intake") or "",
93
- str(f.get("times_per_day") or ""),
94
- ";".join(f.get("time_slots") or []),
95
- ]
96
- return ",".join(row)
97
 
98
 
99
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
100
- gr.Markdown("# MedCard-KR Β· μ•½λ΄‰νˆ¬ OCR β†’ 볡용 일정 μΉ΄λ“œ")
101
- with gr.Row():
102
- with gr.Column():
103
- img_in = gr.Image(type="pil", label="μ•½ λ΄‰νˆ¬/라벨 사진")
104
- btn = gr.Button("인식 & μΉ΄λ“œ 생성", variant="primary")
105
- csv = gr.Textbox(label="CSV(μ•½λͺ…,1νšŒμš©λŸ‰,1일횟수,μ‹œκ°„λŒ€)")
106
- with gr.Column():
107
- json_out = gr.Code(label="인식 κ²°κ³Ό(JSON)")
108
- card = gr.Image(type="pil", label="일정 μΉ΄λ“œ(미리보기)")
109
- btn.click(run_pipeline, inputs=img_in, outputs=[json_out, card, csv])
110
 
111
 
112
  if __name__ == "__main__":
113
- demo.queue().launch()
 
1
+ import json
2
+ import re
3
+ from typing import Any, Dict, List, Optional
4
+
5
+ import gradio as gr
6
+ from PIL import Image, ImageDraw
7
+ from transformers import pipeline
8
+
9
+ # --- OCR pipeline ---------------------------------------------------------
10
+ # We use a light-weight printed-text OCR model that works well for receipts/labels.
11
+ ocr = pipeline("image-to-text", model="microsoft/trocr-base-printed")
12
+
13
+ # Korean keywords describing time slots on prescription labels.
14
+ TIME_KEYWORDS = [
15
+ "μ•„μΉ¨",
16
+ "점심",
17
+ "저녁",
18
+ "μ·¨μΉ¨",
19
+ "자기",
20
+ "식전",
21
+ "식후",
22
+ "식간",
23
+ "기상",
24
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
 
27
+ def _extract_time_slots(text: str) -> List[str]:
28
+ slots = []
29
+ for kw in TIME_KEYWORDS:
30
+ if kw in text:
31
+ slots.append(kw)
32
+ # Also capture explicit times like 08:00 ν˜Ήμ€ 8μ‹œ
33
+ for match in re.findall(r"(\d{1,2}[:μ‹œ]\d{0,2})", text):
34
+ norm = match.replace("μ‹œ", ":")
35
+ if norm.endswith(":"):
36
+ norm += "00"
37
+ if norm not in slots:
38
+ slots.append(norm)
39
+ return slots
40
+
41
+
42
+ def parse_fields(raw: str) -> Dict[str, Any]:
43
+ """Extract drug name and dosage information from OCR text."""
44
+ text = raw.replace("\n", " ")
45
+ text = re.sub(r"\s+", " ", text)
46
+
47
+ # 1) μ•½ 이름: 단어 + μš©λŸ‰ νŒ¨ν„΄ μ£Όλ³€μ—μ„œ μ°ΎκΈ°
48
+ drug_name: Optional[str] = None
49
+ drug_match = re.search(r"([κ°€-힣A-Za-z]+)\s*(\d+)\s*(mg|mL|ML|μ •)", text)
50
+ if drug_match:
51
+ drug_name = drug_match.group(1)
52
+ else:
53
+ fallback = re.search(r"([κ°€-힣A-Za-z]{2,})", text)
54
+ drug_name = fallback.group(1) if fallback else None
55
+
56
+ # 2) 1회 μš©λŸ‰: "1회 1μ •", "1μ •", "5 mL" λ“±
57
+ dose_per_intake: Optional[str] = None
58
+ dose_match = re.search(r"(1회\s*)?(\d+[\./]?\d*)\s*([κ°€-힣A-Za-z]+|mL|ml|mg|μ •)", text)
59
+ if dose_match:
60
+ dose_per_intake = f"{dose_match.group(2)} {dose_match.group(3)}".strip()
61
+
62
+ # 3) 1일 볡용 횟수: "1일 3회", "ν•˜λ£¨ 2회"
63
+ times_per_day: Optional[int] = None
64
+ times_match = re.search(r"(?:1일|ν•˜λ£¨)\s*(\d+)\s*회", text)
65
+ if times_match:
66
+ times_per_day = int(times_match.group(1))
67
+
68
+ # 4) μ‹œκ°„λŒ€ ν‚€μ›Œλ“œ/μ‹œκ° μΆ”μΆœ
69
+ time_slots = _extract_time_slots(text)
70
+
71
+ return {
72
+ "drug_name": drug_name,
73
+ "dose_per_intake": dose_per_intake,
74
+ "times_per_day": times_per_day,
75
+ "time_slots": time_slots or None,
76
+ }
77
+
78
+
79
+ def ocr_and_parse(image: Image.Image) -> Dict[str, Any]:
80
+ raw_text = ocr(image)[0]["generated_text"]
81
+ fields = parse_fields(raw_text)
82
+
83
+ warnings: List[str] = []
84
+ if not fields["drug_name"]:
85
+ warnings.append("μ•½ 이름 인식이 λΆˆν™•μ‹€ν•©λ‹ˆλ‹€.")
86
+ if not fields["times_per_day"]:
87
+ warnings.append("1일 횟수λ₯Ό μ°Ύμ§€ λͺ»ν–ˆμŠ΅λ‹ˆλ‹€ (예: 1일 3회).")
88
+
89
+ return {"raw_text": raw_text, "fields": fields, "warnings": warnings}
90
 
91
 
92
  def render_card(fields: Dict[str, Any]) -> Image.Image:
93
+ width, height = 720, 400
94
+ img = Image.new("RGB", (width, height), "white")
95
+ draw = ImageDraw.Draw(img)
 
 
 
 
96
 
97
+ header_text = "였늘 볡용 일정"
98
+ draw.rectangle((0, 0, width, 60), fill=(230, 240, 255))
99
+ draw.text((24, 18), header_text, fill=(0, 0, 0))
100
 
101
+ y = 90
 
 
 
 
 
102
 
103
+ def add_line(label: str, value: Optional[str]):
104
+ nonlocal y
105
+ draw.text((24, y), label, fill=(60, 60, 60))
106
+ display = value if value else "-"
107
+ draw.text((180, y), f": {display}", fill=(0, 0, 0))
108
+ y += 34
109
 
110
+ add_line("μ•½ 이름", fields.get("drug_name"))
111
+ add_line("1회 μš©λŸ‰", fields.get("dose_per_intake"))
112
+ add_line("1일 횟수", str(fields.get("times_per_day") or ""))
 
 
113
 
114
+ slots = fields.get("time_slots") or []
115
+ add_line("μ‹œκ°„λŒ€", ", ".join(slots) if slots else None)
116
 
117
+ footer = "β€» μ˜λ£Œμ§„ 처방이 μš°μ„ μ΄λ©°, λ³Έ 앱은 μ°Έκ³ μš©μž…λ‹ˆλ‹€."
118
+ draw.text((24, height - 60), footer, fill=(120, 120, 120))
119
+ return img
120
 
121
 
122
+ def to_csv_row(output: Dict[str, Any]) -> str:
123
+ fields = output["fields"]
124
+ row = [
125
+ fields.get("drug_name") or "",
126
+ fields.get("dose_per_intake") or "",
127
+ str(fields.get("times_per_day") or ""),
128
+ ";".join(fields.get("time_slots") or []),
129
+ ]
130
+ return ",".join(row)
131
 
132
 
133
+ def run_pipeline(image: Optional[Image.Image]):
134
+ if image is None:
135
+ return "이미지λ₯Ό μ—…λ‘œλ“œν•˜μ„Έμš”.", None, None
 
 
 
 
136
 
137
+ output = ocr_and_parse(image)
138
+ card = render_card(output["fields"])
139
+ csv_row = to_csv_row(output)
140
+ json_text = json.dumps(output, ensure_ascii=False, indent=2)
141
+ return json_text, card, csv_row
 
 
 
 
 
 
 
142
 
143
 
144
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
145
+ gr.Markdown("# MedCard-KR Β· μ•½λ΄‰νˆ¬ OCR β†’ 볡용 일정 μΉ΄λ“œ")
146
+ with gr.Row():
147
+ with gr.Column():
148
+ img_in = gr.Image(type="pil", label="μ•½ λ΄‰νˆ¬/라벨 사진")
149
+ btn = gr.Button("인식 & μΉ΄λ“œ 생성", variant="primary")
150
+ csv_box = gr.Textbox(label="CSV(μ•½λͺ…,1νšŒμš©λŸ‰,1일횟수,μ‹œκ°„λŒ€)")
151
+ with gr.Column():
152
+ json_out = gr.Code(label="인식 κ²°κ³Ό(JSON)")
153
+ card_out = gr.Image(type="pil", label="일정 μΉ΄λ“œ(미리보기)")
154
+ btn.click(run_pipeline, inputs=img_in, outputs=[json_out, card_out, csv_box])
155
 
156
 
157
  if __name__ == "__main__":
158
+ demo.queue().launch()