Spaces:

jibsn
/

OCSU

Sleeping

App Files Files Community

OCSU / det_engine.py

jibsn

Upload 5 files

6d6d481 verified 11 months ago

raw

history blame contribute delete

263 kB

	"""
	Copyright (c) All Rights Reserved
	by bowen
	"""

	import json
	import math
	import os
	import sys
	import pathlib
	from typing import Iterable, List
	import random
	import itertools

	import numpy as np
	import pandas as pd
	import tqdm
	import torch
	import torch.amp
	from PIL import Image
	# from src.data import CocoEvaluator
	# from src.misc import (MetricLogger, SmoothedValue, reduce_dict)
	# from src.solver.utils import output_to_smiles, output_to_smiles2
	# from src.solver.utils import bbox_to_graph_with_charge, mol_from_graph_with_chiral
	# from src.misc.draw_box_utils import draw_objs

	# from sklearn.metrics import f1_score
	# from src.postprocess.abbreviation_detector import get_ocr_recognition_only
	# from src.postprocess.utils_dataset import CaptionRemover
	from skimage.measure import label
	######################################add metric postprocess
	import rdkit
	from rdkit import Chem
	from rdkit.Chem import Draw, AllChem
	from rdkit.Chem import rdchem, RWMol, CombineMols
	from rdkit import Chem
	from rdkit.Chem import rdFMCS
	import copy
	from paddleocr import PaddleOCR
	import re
	from rdkit import DataStructs
	import matplotlib.pyplot as plt
	from matplotlib.patches import Rectangle, Circle
	from scipy.spatial import cKDTree, KDTree
	from rdkit.Geometry import Point3D
	import multiprocessing



	def select_longest_smiles(smiles):
	# 将 SMILES 以 '.' 分割为多个部分
	components = smiles.split('.')
	# 选择字符数最多的部分作为主结构
	longest_component = max(components, key=len)
	return longest_component

	def MCS_mol(mcs):
	#mcs_smart = mcs.smartsString
	mcs_mol = Chem.MolFromSmarts(mcs.smartsString)
	AllChem.Compute2DCoords(mcs_mol)
	return mcs_mol

	def g_atompair_matches(pair,mcs):
	mcs_mol = MCS_mol(mcs)
	matches0 = pair[0].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False)
	matches1 = pair[1].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False)
	if len(matches0) != len(matches1):
	matches0=list(matches0)
	matches1=list(matches1)
	print( " g_atompair_matches noted: matcher not equal !!")
	if len(matches0)>len(matches1) and len(matches1) !=0:
	for i in range(0,len(matches0)):
	if i < len(matches1):
	pass
	else:
	ii=i % len(matches1)
	matches1.append(matches1[ii])
	else:
	for i in range(0,len(matches1)):
	if i < len(matches0) and len(matches0):
	pass
	else:
	ii=i % len(matches0)
	matches0.append(matches0[ii])
	# assert len(matches0) == len(matches1), "matcher not equal break!!"
	if len(matches0) != len(matches1):
	atommaping_pairs=[[]]
	else:atommaping_pairs=[list(zip(matches0[i],matches1[i])) for i in range(0,len(matches0))]
	return atommaping_pairs


	class CustomError(Exception):
	"""A custom exception for specific errors."""
	pass

	bond_dirs = {'NONE': Chem.rdchem.BondDir.NONE,
	'ENDUPRIGHT': Chem.rdchem.BondDir.ENDUPRIGHT,
	'BEGINWEDGE': Chem.rdchem.BondDir.BEGINWEDGE,
	'BEGINDASH': Chem.rdchem.BondDir.BEGINDASH,
	'ENDDOWNRIGHT': Chem.rdchem.BondDir.ENDDOWNRIGHT,}

	BONDTYPE = {'SINGLE': Chem.rdchem.BondType.SINGLE,
	'DOUBLE': Chem.rdchem.BondType.DOUBLE,
	'TRIPLE': Chem.rdchem.BondType.TRIPLE,
	'AROMATIC': Chem.rdchem.BondType.AROMATIC}
	BOND_DIRS = {'NONE': Chem.rdchem.BondDir.NONE,
	'ENDUPRIGHT': Chem.rdchem.BondDir.ENDUPRIGHT,
	'BEGINWEDGE': Chem.rdchem.BondDir.BEGINWEDGE,
	'BEGINDASH': Chem.rdchem.BondDir.BEGINDASH,
	'ENDDOWNRIGHT': Chem.rdchem.BondDir.ENDDOWNRIGHT,}
	BONDDIRECT=['ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']


	BONDTYPE2ORD={
	'wdge':1,
	'dash':1,
	Chem.rdchem.BondType.SINGLE: 1,
	Chem.rdchem.BondType.DOUBLE: 2,
	Chem.rdchem.BondType.TRIPLE: 3,
	Chem.rdchem.BondType.AROMATIC: 1.5,
	}

	BONDTYPE={'SINGLE': Chem.BondType.SINGLE,
	'DOUBLE': Chem.BondType.DOUBLE,
	'TRIPLE': Chem.BondType.TRIPLE,
	'AROMATIC': Chem.BondType.AROMATIC}

	VALENCES = {
	"H": [1], "Li": [1], "Be": [2], "B": [3], "C": [4], "N": [3, 5], "O": [2], "F": [1],
	"Na": [1], "Mg": [2], "Al": [3], "Si": [4], "P": [5, 3], "S": [6, 2, 4], "Cl": [1], "K": [1], "Ca": [2],
	"Br": [1], "I": [1], "*":[3,4,5,6],
	}

	ELEMENTS = [
	"H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
	"Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca",
	"Sc", "Ti", "Ru", "Rh","Rn","Rf", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
	"Ga", "Ge", "As", "Se", "Br", "Kr", "Sr", "Zr",
	"Nb", "Mo", "Tc", "Pd", "Ag", "Cd", "In", "Sn",
	"Sb", "Te", "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd",
	"Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
	"Lu", "Hf", "Ta", "W", "Os", "Ir", "Pt", "Au", "Hg",
	"Tl", "Pb", "Bi", "Po", "At", "Fr", "Ac", "Th",
	"Pa", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm",
	"Md", "No", "Lr", "Db", "Sg", "Bh", "Hs", "Mt", "Ds",
	"Cn", "Nh", "Fl", "Mc", "Lv", "Og"
	]
	# "Rg", "Rb", "Re", "Ra"as RGROUP in the Molscribe data
	#"V", "Y","U", # be viewed as C for paddleOCR smt ONELEMENTS ['A','J]
	#"Ts" #as a chemical group [S](C1=CC=C(C=C1)C)(=O)=O
	RGROUP_SYMBOLS = ['R',"R'" 'R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'R9', 'R10', 'R11', 'R12',
	'Ra', 'Rb', 'Rc', 'Rd','Re','Rg', 'X', 'Y', 'Z', 'Q', 'A', 'E', 'Ar',
	"V", "Y","U",'M', 'G','L',
	'Nr','Tt','Uu','Vv','Ww',#CLEF Nr is not in periodic table
	'D',#CLEF as [2H] but not recongited by rdkit chemdraw
	]

	COLORS = {
	u'c': '0.0,0.75,0.75', u'b': '0.0,0.0,1.0', u'g': '0.0,0.5,0.0', u'y': '0.75,0.75,0',
	u'k': '0.0,0.0,0.0', u'r': '1.0,0.0,0.0', u'm': '0.75,0,0.75'
	}

	class Substitution(object):
	'''Define common substitutions for chemical shorthand'''
	def __init__(self, abbrvs, smarts, smiles, probability):
	assert type(abbrvs) is list
	self.abbrvs = abbrvs
	self.smarts = smarts
	self.smiles = smiles
	self.probability = probability

	SUBSTITUTIONS: List[Substitution] = [
	#abbrvs, smarts, smiles
	#patch4 USPTO,try put the longer one first, as re use match by order
	Substitution(['CH2CH2NSO2CH3'], '[CH2][CH]', '[CH2]CNS(=O)(C)=O', 0.5),
	Substitution(['NHNHCOCF3'], 'NHNHCOCF3', '[NH]NC(=O)C(F)(F)(F)', 0.5),
	Substitution(['CO2CysPr'], 'CO2CysPr', '[C](=O)ON[C@H](C(CCC)=O)CS', 0.5),
	Substitution(['OCH2CHOHCH2'], 'OCH2CHOHCH2', '[O]CC(O)C', 0.5),
	Substitution(['OCH2CHOHCH2OH'], 'OCH2CHOHCH2', '[O]CC(O)CO', 0.5),
	# elif symbol in ['SO2(CH2)3SO2NHCH2CHCH2OH']:smiles='[S](=O)(=O)CCCS(=O)(=O)NC[C]CO'
	Substitution(['SO2(CH2)3SO2NHCH2CHCH2OH'], 'OCH2CHOHCH2', '[S](=O)(=O)CCCS(=O)(=O)NC[C]CO', 0.5),




	Substitution(['NO2', 'O2N'], '[N+](=O)[O-]', "[N+](=O)[O-]", 0.5),
	# Substitution(['CHO', 'OHC'], '[CH1](=O)', "[CH1](=O)", 0.5),
	Substitution(['CO2Et', 'COOEt'], 'C(=O)[OH0;D2][CH2;D2][CH3]', "[C](=O)OCC", 0.5),

	Substitution(['OAc','AcO'], '[OH0;X2]C(=O)[CH3]', "[O]C(=O)C", 0.7),
	Substitution(['NHAc'], '[NH1;D2]C(=O)[CH3]', "[NH]C(=O)C", 0.7),
	Substitution(['Ac'], 'C(=O)[CH3]', "[C](=O)C", 0.1),

	Substitution(['OBz','BzO'], '[OH0;D2]C(=O)[cH0]1[cH][cH][cH][cH][cH]1', "[O]C(=O)c1ccccc1", 0.7), # Benzoyl
	Substitution(['Bz'], 'C(=O)[cH0]1[cH][cH][cH][cH][cH]1', "[C](=O)c1ccccc1", 0.2), # Benzoyl

	Substitution(['COOBn','BnO2C'], '[OH0;D2][CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[C](=O)OCc1ccccc1", 0.7), # Benzyl
	Substitution(['OBn','BnO'], '[OH0;D2][CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[O]Cc1ccccc1", 0.7), # Benzyl
	Substitution(['Bn'], '[CH2;D2][cH0]1[cH][cH][cH][cH][cH]1', "[CH2]c1ccccc1", 0.2), # Benzyl
	Substitution(['NHBn'], '[NH]Cc1ccccc1', "[NH]Cc1ccccc1", 0.2), # Benzyl
	Substitution(['NBn2'], '[NH]Cc1ccccc1', "[N](Cc1ccccc1)Cc1ccccc1", 0.2), # Benzyl

	Substitution(['NHBoc','BocHN',"BOCHN"], '[NH1;D2]C(=O)OC([CH3])([CH3])[CH3]', "[NH]C(=O)OC(C)(C)C", 0.6),
	Substitution(['NBoc'], '[NH0;D3]C(=O)OC([CH3])([CH3])[CH3]', "[NH1]C(=O)OC(C)(C)C", 0.6),
	Substitution(['Boc','BOc'], 'C(=O)OC([CH3])([CH3])[CH3]', "[C](=O)OC(C)(C)C", 0.2),

	Substitution(['Cbm'], 'C(=O)[NH2;D1]', "[C](=O)N", 0.2),
	Substitution(['Cbz'], 'C(=O)OC[cH]1[cH][cH][cH1][cH][cH]1', "[C](=O)OCc1ccccc1", 0.4),
	Substitution(['NHCbz'], 'C(=O)OC[cH]1[cH][cH][cH1][cH][cH]1', "[NH]C(=O)OCc1ccccc1", 0.4),
	Substitution(['Cy'], '[CH1;X3]1[CH2][CH2][CH2][CH2][CH2]1', "[CH1]1CCCCC1", 0.3),
	Substitution(['Fmoc'], 'C(=O)O[CH2][CH1]1c([cH1][cH1][cH1][cH1]2)c2c3c1[cH1][cH1][cH1][cH1]3',
	"[C](=O)OCC1c(cccc2)c2c3c1cccc3", 0.6),
	Substitution(['FmocHN','FmOcHN', 'NHFmoc'], 'C(=O)O[CH2][CH1]1c([cH1][cH1][cH1][cH1]2)c2c3c1[cH1][cH1][cH1][cH1]3',
	"[NH]C(=O)OCC1c(cccc2)c2c3c1cccc3", 0.6),
	Substitution(['Mes'], '[cH0]1c([CH3])cc([CH3])cc([CH3])1', "[c]1c(C)cc(C)cc(C)1", 0.5),
	Substitution(['OMs','MsO'], '[OH0;D2]S(=O)(=O)[CH3]', "[O]S(=O)(=O)C", 0.7),
	Substitution(['Ms'], 'S(=O)(=O)[CH3]', "[S](=O)(=O)C", 0.2),
	Substitution(['Ph'], '[cH0]1[cH][cH][cH1][cH][cH]1', "[c]1ccccc1", 0.5),

	Substitution(['PMB'], '[CH2;D2][cH0]1[cH1][cH1][cH0](O[CH3])[cH1][cH1]1', "[CH2]c1ccc(OC)cc1", 0.2),
	Substitution(['PMBN'], '[CH2;D2][cH0]1[cH1][cH1][cH0](O[CH3])[cH1][cH1]1', "[N]Cc1ccc(OC)cc1", 0.2),
	Substitution(['Py'], '[cH0]1[n;+0][cH1][cH1][cH1][cH1]1', "[c]1ncccc1", 0.1),
	# Substitution(['SEM','MES'], '[CH2;D2][CH2][Si]([CH3])([CH3])[CH3]', "[CH2]CSi(C)(C)C", 0.2),
	Substitution(['SEM','MES'], '[CH2;D2][O][CH2][CH2][Si]([CH3])([CH3])[CH3]', "[CH2]OCC[Si](C)(C)C", 0.2),#fix above

	Substitution(['Suc'], 'C(=O)[CH2][CH2]C(=O)[OH]', "[C](=O)CCC(=O)O", 0.2),
	Substitution(['TBS'], '[Si]([CH3])([CH3])C([CH3])([CH3])[CH3]', "[Si](C)(C)C(C)(C)C", 0.5),
	Substitution(['TBZ'], 'C(=S)[cH]1[cH][cH][cH1][cH][cH]1', "[C](=S)c1ccccc1", 0.2),
	Substitution(['OTf'], '[OH0;D2]S(=O)(=O)C(F)(F)F', "[O]S(=O)(=O)C(F)(F)F", 0.7),
	Substitution(['Tf'], 'S(=O)(=O)C(F)(F)F', "[S](=O)(=O)C(F)(F)F", 0.2),
	Substitution(['TFA'], 'C(=O)C(F)(F)F', "[C](=O)C(F)(F)F", 0.3),
	Substitution(['TFAH2N'], 'C(=O)C(F)(F)F', "[NH]C(=O)C(F)(F)F", 0.3),
	Substitution(['TMS'], '[Si]([CH3])([CH3])[CH3]', "[Si](C)(C)C", 0.5),
	Substitution(['Ts'], 'S(=O)(=O)c1[cH1][cH1][cH0]([CH3])[cH1][cH1]1', "[S](=O)(=O)c1ccc(C)cc1", 0.6), # Ts
	Substitution(['TsO','OTs'], '[O]S(C1=CC=C(C=C1)C)(=O)=O', "[O]S(C1=CC=C(C=C1)C)(=O)=O", 0.6), # Ts

	Substitution(['COCH3'], '[OH0;D2][CH3;D1]', "[C](=O)C", 0.3),
	# Alkyl chains
	Substitution(['OMe', 'MeO','H;CO', 'CH3O','OCH3', 'H3CO'], '[OH0;D2][CH3;D1]', "[O]C", 0.3),
	Substitution(['SMe', 'MeS'], '[SH0;D2][CH3;D1]', "[S]C", 0.3),
	Substitution(['NMe', 'MeN'], '[N;X3][CH3;D1]', "[N]C", 0.3),#modified as [NH]not wanted
	Substitution(['NMe2', 'Me2N'], '[N;X3](C)[CH3;D1]', "[N](C)C", 0.3),#modified as [NH]not wanted

	Substitution(['Me'], '[CH3;D1]', "[CH3]", 0.1),
	Substitution(['OEt', 'EtO','C2H5O','OC2H5'], '[OH0;D2][CH2;D2][CH3]', "[O]CC", 0.5),
	Substitution(['MeOH2C','CH2OMe'], '[CH2;D2]O[CH3]', "[CH2]OC", 0.5),
	Substitution(['Et', 'CH2CH3','CH3CH2'], '[CH2;D2][CH3]', "[CH2]C", 0.3),


	Substitution(['Pr', 'nPr', 'n-Pr'], '[CH2;D2][CH2;D2][CH3]', "[CH2]CC", 0.3),
	Substitution(['Bu', 'nBu', 'n-Bu'], '[CH2;D2][CH2;D2][CH2;D2][CH3]', "[CH2]CCC", 0.3),
	# Substitution(['nBu', 'n-Bu'], '[CH2;D2][CH2;D2][CH2;D2][CH3]', "[CH2]CCC", 0.3),

	# Branched
	Substitution(['iPr', 'i-Pr'], '[CH1;D3]([CH3])[CH3]', "[CH1](C)C", 0.2),
	Substitution(['iBu', 'i-Bu'], '[CH2;D2][CH1;D3]([CH3])[CH3]', "[CH2]C(C)C", 0.2),
	Substitution(['OiBu'], '[OH0;D2][CH2;D2][CH1;D3]([CH3])[CH3]', "[O]CC(C)C", 0.2),
	Substitution(['OtBu','tBuO'], '[OH0;D2][CH0]([CH3])([CH3])[CH3]', "[O]C(C)(C)C", 0.6),
	Substitution(['tBu', 't-Bu'], '[CH0]([CH3])([CH3])[CH3]', "[C](C)(C)C", 0.3),

	# Other shorthands (MIGHT NOT WANT ALL OF THESE)
	Substitution(['CF3', 'F3C'], '[CH0;D4](F)(F)F', "[C](F)(F)F", 0.5),
	Substitution(['NCF3', 'F3CN'], '[N;X3][CH0;D4](F)(F)F', "[NH]C(F)(F)F", 0.5),
	Substitution(['OCF3', 'F3CO'], '[OH0;X2][CH0;D4](F)(F)F', "[O]C(F)(F)F", 0.5),
	Substitution(['OCCl3', 'Cl3CO'], '[OH0;X2][CH0;D4](Cl)(Cl)Cl', "[O]C(Cl)(Cl)Cl", 0.5),
	Substitution(['SCF3', 'F3CS'], '[SH0;X2][CH0;D4](F)(F)F', "[S]C(F)(F)F", 0.5),
	Substitution(['CCl3'], '[CH0;D4](Cl)(Cl)Cl', "[C](Cl)(Cl)Cl", 0.5),
	Substitution(['CO2H', 'HO2C', 'COOH'], 'C(=O)[OH]', "[C](=O)O", 0.5), # COOH
	Substitution(['CO2NH4','COONH4','H4NOOC','H4NO2C'], 'C(=O)[OH]', "[C](=O)ON", 0.5), # COOH
	Substitution([ 'COO-','CO2-'], 'C(=O)[OH]', "[C](=O)[O-]", 0.5), # COOH
	# Substitution([ 'COO'], 'C(=O)[OH]', "[C](=O)O", 0.5), # COOH
	Substitution(['CN', 'NC'], 'C#[ND1]', "[C]#N", 0.5),
	# Substitution(['OCH3', 'H3CO'], '[OH0;D2][CH3]', "[O]C", 0.4),
	#TODO if need just addit here
	Substitution(['N3'], '[N]=[N+]=[N-]', "[N]=[N+]=[N-]", 0.4),#ACS image dataset has
	# [N-]=[N+]
	Substitution(['N2+Cl-','Cl-N2+'], '[N+]#[N].[Cl-]', "[N+]#[N].[Cl-]", 0.4),#ACS image dataset has
	Substitution(['N2'], '[N]=[N-]', "[N]=[N-]", 0.4),#ACS image dataset has
	Substitution(['N2H'], '[N]=[N-]', "[N]=[NH]", 0.4),#ACS image dataset has
	Substitution(['NO','N=O','O=N','ON'], '[N]=[O]', "[N]=O", 0.4),#ACS image dataset has
	Substitution(['NCH3'], '[N]C', "[NH]C", 0.4),#ACS image dataset has
	Substitution(['NOMe'], '[N]OC', "[N]OC", 0.4),#ACS image dataset has
	Substitution(['OCH2'], '[O]C', "[O]C", 0.4),#FORMULA_REGEX
	Substitution(['C=O','O=C'], '[C]=[O]', "[C]=O", 0.4),#ACS image dataset has
	Substitution(['NPh','PhN'], 'NC1=CC=CC=C1', "[N]C1=CC=CC=C1", 0.4),#ACS image dataset has
	Substitution(['NHPh','PhNH','PhHN'], 'NC1=CC=CC=C1', "[NH]C1=CC=CC=C1", 0.4),#ACS image dataset has
	Substitution(['TMSO','OSMT'], 'O[Si](C)(C)C', "[O][Si](C)(C)C", 0.5),
	Substitution(['SPh','PhS'], 'SC1=CC=CC=C1', "[S]C1=CC=CC=C1", 0.4),#ACS image dataset has
	Substitution(['SO3H'], 'S(=O)(=O)[OH]', "[S](=O)(=O)O", 0.4),
	Substitution(['SO3NH2','SO3NH4','H4NO3S'], 'S(=O)(=O)[OH]', "[S](=O)(=O)ON", 0.4),
	Substitution(['SO3'], 'S(=O)(=O)[OH]', "[S](=O)(=O)[O-]", 0.4),
	Substitution(['SO2CF3'], '[S](=O)(=O)C(F)(F)F', "[S](=O)(=O)C(F)(F)F", 0.5),
	Substitution(['SO2Cl'], '[S](=O)(=O)Cl', "[S](=O)(=O)Cl", 0.5),
	Substitution(['SO2F'], '[S](=O)(=O)F', "[S](=O)(=O)F", 0.5),
	Substitution(['SO2'], '[S](=O)(=O)', "[S](=O)(=O)", 0.5),
	Substitution(['SO2NH'], '[S](=O)(=O)[N]', "[S](=O)(=O)[N]", 0.5),#US07323045-20080129-C00062 may lead wrong connext
	Substitution(['SO2NH2'], '[S](=O)(=O)[NH2]', "[S](=O)(=O)[NH2]", 0.5),
	Substitution(['SO2Me','SO2CH3'], '[S](=O)(=O)C', "[S](=O)(=O)C", 0.5),
	Substitution(['NHO2S'], '[S](=O)(=O)[N]', "[N][S](=O)(=O)", 0.5),#US07323045-20080129-C00062 may lead wrong connext
	Substitution(['OSO2Me'], '[O]S(=O)(=O)C', "[O]S(=O)(=O)C", 0.5),
	Substitution(['NHSO2Me'], '[NH]S(=O)(=O)C', "[NH]S(=O)(=O)C", 0.5),
	Substitution(['SOCH3','SOMe'], '[S](=O)(=O)', "[S](=O)C", 0.5),

	Substitution(['P+Ph3Br-'], '[P+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', "[P+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3", 0.5),
	Substitution(['N+Ph3Br-'], '[N+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', "[N+](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3", 0.5),
	Substitution(['PPh2'], "[P](C1=CC=CC=C1)C2=CC=CC=C2", "[P](C1=CC=CC=C1)C2=CC=CC=C2", 0.5),
	# Substitution(['BOcHN',"BOCHN"], "[NH]C(OC(C)(C)C)=O", "[NH]C(OC(C)(C)C)=O", 0.5),
	Substitution(['CO2Me', 'COOMe'], 'C(=O)[OH0;D2][CH3]', "[C](=O)OC", 0.5),
	Substitution(['ONa', 'NaO'], '[O][Na]', "[O][Na]", 0.5),
	Substitution(['OTBDMS', 'TBDMSO'], "[O][Si](C)(C)C(C)(C)C", "[O][Si](C)(C)C(C)(C)C", 0.5),
	Substitution(['CONH2'], '[C](O)(N)', "[C](=O)[NH2]", 0.5),
	Substitution(['NHNH2'], '[NH2;D1]', "[NH]N", 0.1),
	Substitution(['CONH'], 'CONH', '[C](=O)N', 0.5),
	Substitution(['CH3CONH'], '[NH]C(=O)C', '[NH]C(=O)C', 0.5),
	Substitution(['NH3Cl'], '[NH]Cl', '[NH]Cl', 0.5),

	Substitution(['SAc','AcS'], '[S]C(C)=O', "[S]C(C)=O", 0.5),
	Substitution(['OAll'], '[O]CC=C', '[O]CC=C', 0.5),
	# Substitution(['Tos'], '[Si](C)(C)C', '[Si](C)(C)C', 0.5),#NOTE different case ?? @@acs dataset ,we use the SO2here
	Substitution(['Tos','TOs'], '[Si](C)(C)C', '[S](=O)(=O)C(C=C1)=CC=C1C', 0.5),#NOTE different case ??
	Substitution(['OTos','OTOs','soTO'], '[Si](C)(C)C', '[O]S(=O)(=O)C(C=C1)=CC=C1C', 0.5),#NOTE different case ??
	Substitution(['TsN'], '[N]S(C1=CC=C(C=C1)C)(=O)=O', '[N]S(C1=CC=C(C=C1)C)(=O)=O', 0.5),
	Substitution(['Ts'], '[S](C1=CC=C(C=C1)C)(=O)=O', '[S](C1=CC=C(C=C1)C)(=O)=O', 0.5),
	Substitution(['COCF3'], '[C](=O)C(F)(F)(F)', '[C](=O)C(F)(F)(F)', 0.5),
	Substitution(['CF2', 'F2C'], '[C;D4](F)(F)', "[C](F)(F)", 0.5),
	Substitution(['PMB'], '[CH2]C1=CC=C(C=C1)OC', '[CH2]C1=CC=C(C=C1)OC', 0.5),
	Substitution(['NHCOtBu'], '[NH]C(C(C)(C)C)=O','[NH]C(C(C)(C)C)=O', 0.5),
	Substitution(['OCN'], '[N]=C=O', "[N]=C=O", 0.5),
	Substitution(['Me3Si'], '[Si](C)(C)(C)', "[Si](C)(C)(C)", 0.5),
	Substitution(['PhO','OPh'], '[O]C1=CC=CC=C1', "[O]C1=CC=CC=C1", 0.5),
	Substitution(['Allyl'], '[CH2]C=C', '[CH2]C=C', 0.5),
	Substitution(['C7H3'], '[C]#CC#CC#CC', '[C]#CC#CC#CC', 0.5),
	Substitution(['C5H11'], '[CH2]CCCC', '[CH2]CCCC', 0.5),
	Substitution(['R1R2N'], "[N]([])[]", "[N]([])[]", 0.5),
	Substitution(['CO2R'], '[C](=O)O', '[C](=O)O', 0.5),
	Substitution(['CCl3CH2O2C'], '[C](=O)OCC(Cl)(Cl)Cl', '[C](=O)OCC(Cl)(Cl)Cl', 0.5),
	Substitution(['NHOH'], '[NH]O', '[NH]O', 0.5),
	Substitution(['CO2'], '[C](=O)[O]', '[C](=O)[O]', 0.5),
	Substitution(['O2C'], '[C](=O)[O]', '[O][C](=O)', 0.5),#NOTE the direction matters

	Substitution(['PPh3'], '[P](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', '[P](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', 0.5),
	Substitution(['TfO'], '[C](=O)[O]', '[O]S(=O)(C(F)(F)F)=O', 0.5),
	Substitution(['OCH2Ph'], '[O]CC1=CC=CC=C1', '[O]CC1=CC=CC=C1', 0.5),
	Substitution(['OCH2CF3'], '[O]CC(F)(F)(F)', '[O]CC(F)(F)(F)', 0.5),
	Substitution(['COOCH2Ph'], '[C](=O)OCC1=CC=CC=C1', '[C](=O)OCC1=CC=CC=C1', 0.5),
	Substitution(['OCH2OC2H5'], '[C](=O)C(C)(C)C', '[O]COCC', 0.5),

	Substitution(['Trt'], '[C](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', '[C](C1=CC=CC=C1)(C2=CC=CC=C2)C3=CC=CC=C3', 0.5),
	Substitution(['SF5'], '[S](F)(F)(F)(F)F', '[S](F)(F)(F)(F)F', 0.5),

	# Substitution(['CH2CH'], '[CH2][CH]', '[CH2][CH]', 0.5),
	# Substitution(['CH2CH2'], '[CH2][CH2]', '[CH2][CH2]', 0.5),

	# #SIMPLE abbv
	Substitution(['S'], '[S]', '[S]*', 0.5),
	Substitution(['N, NH'], '[NH]', '[NH]', 0.5),
	Substitution(['C','CH2'], '[C]', '[CH2]', 0.5),
	Substitution(['P',"PH"], '[P]', '[PH]', 0.5),
	Substitution(['O'], '[O]', '[O]*', 0.5),
	#（） effect
	Substitution(['N(CH3)2'], '[N](C)(C)', "[N](C)(C)", 0.5),
	Substitution(['(C2H5)2N','Et2N'], '[N](C)(C)', "[N](CC)(CC)", 0.5),
	Substitution(['B(OH)2'], '[B](O)O', "[B](O)O", 0.5),
	Substitution(['CO2C(CH3)3'], '[C](=O)C(C)(C)C', '[C](=O)C(C)(C)C', 0.5),
	Substitution(['P(O)(OEt)2', 'P(OEt)2(O)'], "[P](OCC)(=O)CCO", "[P](OCC)(=O)OCC", 0.5),
	Substitution(['(CH2)16Me'], '[CH2]CCCCCCCCCCCCCCCC', "[CH2]CCCCCCCCCCCCCCCC", 0.3),
	Substitution(['(CH2)11Me'], '[CH2]CCCCCCCCCCC', "[CH2]CCCCCCCCCCC", 0.3),
	Substitution(['N(H)Et','Et(H)N'], '[NH]CC', '[NH]CC', 0.5),
	Substitution(['N(H)Me','Me(H)N'], '[NH]C', '[NH]C', 0.5),



	]
	ABBREVIATIONS = {abbrv: sub for sub in SUBSTITUTIONS for abbrv in sub.abbrvs}


	def extract_abbreviation_key(item):
	if isinstance(item, list):
	while isinstance(item, list):
	item = item[0]
	return item
	return item


	def clean_unpaired_brackets(text):
	#keep paired, del unpared
	result = []
	stack = []
	bracket_pairs = {')': '(', ']': '['}
	opening_brackets = {'(', '['}

	for char in text:
	if char in opening_brackets:
	stack.append(char)
	result.append(char)
	elif char in bracket_pairs:
	if stack and stack[-1] == bracket_pairs[char]:
	stack.pop()
	result.append(char)
	else:
	# 未配对的闭合括号，跳过
	continue
	else:
	result.append(char)
	return ''.join(result)

	# def del_unpairebrackets(opening_brackets):
	# # 移除未配对的开括号
	# keep paired, del unpared
	# result = []
	# stack = []
	# bracket_pairs = {')': '(', ']': '['}
	# opening_brackets = {'(', '['}
	# for char in result:
	# if char in opening_brackets:
	# stack.append(char)
	# elif char in bracket_pairs:
	# if stack and stack[-1] == bracket_pairs[char]:
	# stack.pop()
	# final_result.append(char)
	# else:
	# continue
	# else:
	# final_result.append(char)

	# # 如果仍有未闭合的开括号，移除它们
	# return ''.join(c for c in final_result if not stack or c not in opening_brackets)

	def replace_c1(text):
	# Use negative lookahead to ensure 'C1' isn't followed by another digit
	return re.sub(r'C1(?!\d)', 'Cl', text)
	def transform_formula(formula):
	# 匹配 C 后面的数字和 Hg（允许 Hg 后跟其他元素）
	match = re.match(r'C(\d+)(.?)Hg(.)', formula)
	if not match:
	return formula

	n = int(match.group(1))
	prefix = match.group(2) # Hg 前的部分（如空字符串或其他元素）
	suffix = match.group(3) # Hg 后的部分（如 O2）
	g_new = n * 2 + 1
	return f"C{n}{prefix}H{g_new}{suffix}"
	def Cg_transform_formula(formula):
	# 匹配 C 后面的数字和 Hg（允许 Hg 后跟其他元素）
	match = re.match(r'CgH(\d+)(.*?)', formula)
	if not match:
	return formula

	n = int(match.group(1))
	suffix = match.group(2) # Hg 后的部分（如 O2）
	g_new = (n-1)// 2
	return f"C{g_new}H{n}{suffix}"

	def normalize_ocr_text(text, replacement_map):
	"""Normalize OCR text using the predefined mapping rules"""
	if 'C1'in text:
	text=replace_c1(text)
	if 'Hg' in text:
	text= transform_formula(text)
	if 'Cg' in text:
	text= Cg_transform_formula(text)
	if 'Q' in text:
	pattern = r'Q([A-Z])(\w+)'
	replacement = r'O\1\2'
	text = re.sub(pattern, replacement, text)
	if text in ELEMENTS:
	return text
	#remove space
	if ' ' in text:
	text = text.replace(" ", "")
	if any(c in text for c in '0oO'):
	# Step 1: Replace 'o' or 'O' with '0' when after a digit and before a letter or end of string
	# text = re.sub(r'(?<=[1-9])[oO](?=[a-zA-GI-Z]\|$)', '0', text)
	text = re.sub(r'(?<![CF,CH]\d)[oO](?=[a-zA-GI-Z]\|$)', '0', text)
	if '00' in text: text = re.sub(r'00', 'OO', text)#CH0 to CHO
	# text= re.sub(r'(?<=\d)[oO](?=[a-zA-GI-Z]\|$)', '0', text)
	# Step 2: Replace '0' with 'O' when preceded by a letter or followed by optional digits/commas and a letter
	# pattern = r'(?<=[a-zA-Z])0(?=([a-zA-Z]\|$))'
	if text in ['R20']: return text

	text = re.sub(r'(?<=[a-zA-Z])0(?=([a-zA-Z]\|$))', 'O', text)#CH0 to CHO
	text = re.sub( r'^(0)\|(?<=[a-zA-Z][?\d])0(?=[a-zA-Z0-9]*$\|[a-zA-Z])', 'O', text)#CF20 to CF2O
	# result = re.sub(r'(?<=[a-zA-Z])0\|0(?=[,\d]*[a-zA-Z])', 'O', text)
	# Step 3: Only apply '0' to 'O' replacement if '0' doesn't follow digits 1-9
	# if not re.search(r'[1-9]0', text):
	# text = result
	text=clean_unpaired_brackets(text)
	pattern_n1 = r'^NHR[0-9a-z]$'

	# Your existing text normalization rules
	if text in ['OzN','O2N', 'O,N', 'NOz','NO2', 'NO,', '0;N','02N','N20']: text = 'NO2'
	#jpo
	elif text in ['CHzCH','CH,CH',]:text='CH3CH'
	elif text in ["NHCHzCOOH","NHCH2COOH",]:text='NHCH2COOH'
	elif text in ['CIOC','COCE','ClOC','COCI']:text='COCl'

	elif text in ['CHCOOCHs','CH2COOCH5']:text='CH2COOC2H5'
	#staker
	elif text in ['(t-Bu)','t-Bu']:text='t-Bu'

	#ACS
	elif text in ['SiMe2','Me2Si']:text='SiMe2'
	elif text in ['ArzP(O)','Ar2P(O)']:text='Ar2P(O)'
	elif text in ['P(O)(0Et)2','P(O)(OEt)2']:text='P(O)(OEt)2'
	elif text in ['PhOzS','PhO2S']:text='PhO2S'
	elif text in ['CH3O','CHzO']:text='CH3O'
	elif text in ['NH.HCI','NH,.Hcl']:text='NH2.HCl'

	#CLEF
	elif text in ['2','Z']:text='Z'
	elif text in ['(CH2)m','(CH2)q','(CH2)s']:text='CH2'
	elif text in ['Arl','Ari','Ar2','Ar1',]:text='Ar'
	elif text in [ '"0ls','"ols','S[0]a']:text='S[O]a'
	elif text in ['NHR%','NHR*']:text='NHR8'
	elif text in ['Vv','Vy']:text='Vv'


	elif text in ['N3','NY','Ny']:text='N3'
	elif text in ['C2H52N','N(CH,CH3)2','C;H52N','(C;H5)2N','N(C;Hs)2','N(C;H5)2','(CHzCH2)2N','N(CHCH3)2','(CH3CH2)2N','(C2H52N', '(CHzCH)2N','(C2H5)2N','Et2N']:text='(C2H5)2N'
	elif text in ['(CH3)2N','Me2NH','Me,N','Me2N']:text='Me2N'
	elif text in ['(C;H4O)H','(C2H4O)H']:text='(C2H4O)H'
	elif text in ['(C;H4O)4CH3','(C2H4O)4CH3' ]:text='(C2H4O)4CH3'
	elif text in ['(CH2)16Me' ]:text='(CH2)16Me'
	elif text in ['(CH2)11Me']:text='(CH2)11Me'
	elif text in ['CO2CH2Ph','COOCH2Ph','COOCH,Ph']:text='COOCH2Ph'
	elif text in ['CO2C(CH3)3','(CH3)3CO2C',]:text='CO2C(CH3)3'
	elif text in ['OCH2Ph','OCH,Ph','OCHAPH','OCH;Ph']:text='OCH2Ph'
	elif text in ['(CF2)8H','(CF2)gH','(CF2)sH','CF2sH', 'CF:H','CF2)sH','CF):H' ]: text = '(CF2)8H'
	elif text in ['NHSO,Bu','NHSO2Bu',]: text = 'NHSO2Bu'
	elif text in ['NHSO,CH3','NHSO2CH3','NHSO2Me']: text = 'NHSO2CH3'
	elif text in ['1231','1231','23T', 'l23I']: text = 'l23I'



	elif text in ['CF3','CFs', 'CF,', '13','CF 3','F;C', 'F:C', 'F sC', 'CF', 'CF;', 'CFa', 'FzC', 'CFz']: text = 'F3C'
	elif text in ['OCCl3','Cl3CO',]: text = 'OCCl3'
	elif text in ['CCl3','Cl3C',]: text = 'CCl3'
	elif text in ['F;CN', 'NCF;']: text = 'F3CN'
	elif text in ['NCH3','NHCH3', 'NCH;','CH3N','MeN','MeNH']: text = 'NCH3'
	elif text in ['NOMe']:text='NOMe'
	elif text in ['R,R,N']: text = 'R1R2N'
	elif text in ['HzC','HyC','CHy','CHE','H3C.','1;.C', '1;C', 'M e','Mé', 'CH 3', 'CH:', 'HsC', 'HaC', 'H3C', 'CH3', 'CHa', 'H;C', 'CH,', 'CHs', 'CH;']: text = 'Me'
	# elif text in ['CH2']: text = 'C'
	elif text in ['PhzBr']: text = 'Ph3Br'
	elif text in ['PPh3', 'PPha']: text = 'PPh3'
	elif text in ['Et', 'CH,CH3','Catls','Cafls','CH2CH3','H3CH2C','C:H5','HzCH2C','H3CH2C', 'C,H5', 'CzH5','C2H5','C2Hs']: text = 'CH2CH3'
	elif text in ['Ovle', 'HzCO','OCH', 'OCH:','H2CO', 'CH3O', 'CH,O', 'HsCO','OMe','AME', 'AMe','H3CO', 'MeO']: text = 'OMe'
	elif text in ['OCHa','HgCO', 'OCH','HaCO', 'OCH:','H2CO', 'CH3O', 'CH,O', 'OMe','AME', 'AMe', 'MeO']: text = 'OMe'
	elif text in ['SO2Cl', 'SOzCl']: text = 'SO2Cl'
	elif text in ['SO2F', 'SOzF']: text = 'SO2F'
	elif text in ['SONH', 'HNOS','SON', 'SO2NH']: text = 'SO2NH'
	elif text in ['HNO2S','NHO2S']: text = 'NHO2S'
	elif text in ['SO2Cl', 'SOzCl']: text = 'SO2Cl'
	elif text in ['SO2F', 'SOzF']: text = 'SO2F'
	elif text in ['SONH', 'HNOS','SON', 'SO2NH']: text = 'SO2NH'
	elif text in ['SO2NH2', 'SO,NH', 'SO:NH2', 'SONH2']: text = 'SO2NH2'
	elif text in ['SOzCF3', 'SO2CF3', 'CF3SO2']: text = 'SO2CF3'
	elif text in ['SOz','O2S', '$02', 'S02','SO,', '62','O:S','SO2']: text = 'SO2'
	elif text in ['H3CO2S','SO2CH3']: text='SO2CH3'
	elif text in ['SO3H','SOsH','SOaH', 'HO3S','SOzH','HOzS']: text = 'SO3H'
	elif text in ['MeO2SO','OSO2CH3','OSO2Me']:text='OSO2Me'
	elif text in ['MeO2SHN','NHSO2Me']:text='NHSO2Me'


	elif text in ['PIME', 'PMB']: text = 'PMB'
	elif text in ['1-BU', '-BU', '-Bu', 't-BU','t-Bu']: text = 't-Bu'
	elif text in ['NTS', 'NTs', 'TsN']: text = 'TsN'
	elif text in ['TsO', 'OTs']: text = 'OTs'
	elif text in ['Nz* Cl', "N2+Cl-"]: text = 'N2+Cl-'
	elif text in ['NH3Cl', 'NHzCl','NH;Cl']: text = 'NH3Cl'
	elif text in ['B(OH)2']: text = 'B(OH)2'
	elif text in ['NHAC', 'NHAc']: text = 'NHAc'
	elif text in ['1CO', 'NCO', 'OCN', 'OON']: text = 'OCN'
	elif text in ['COCFs','COCF3', 'COCF s']: text = 'COCF3'
	elif text in ['OCF3', 'OCF 3','OCE', 'OCE:','OCEE', 'F3CO', 'OCF', 'OCF:']: text = 'OCF3'
	elif text in ['SCF3', 'SCE', 'SCEE', 'F3CS', 'SCF', 'SCF:']: text = 'SCF3'
	elif text in ['HzCS', 'SCH3', 'SMe','MeS','H3SC' ]: text = 'SMe'
	elif text in ['CHzCHzO', 'CH3CH2O','H5C2O','OC2H5']: text = 'OEt'
	elif text in ['CO,Et','COzEt', 'CO2Et','H3CH2COOC','CO2C2H5']:text = 'CO2Et'
	elif text in ['OTBS', 'TBSO', 'OTBDMS']: text = 'OTBDMS'
	elif text in ['PhO', 'Pho']: text = 'PhO'
	elif text in ['CI', 'C1']: text = 'Cl'
	elif text in ['P h', 'Ph']: text = 'Ph'
	elif text in ['FAHN', 'TFAH,N','TFAH2N',]: text = 'TFAH2N'
	elif text in ['MeaSi', 'Me3Si']: text = 'Me3Si'

	elif text in ['PHzC','PH;C', 'PH3C']: text = 'PH3C'
	elif text in ['COOH','OOOH','1OOC', 'HOOO','HOOC', 'DOOH', 'CO:H','HO,C','CO,H','CO2H']: text = 'CO2H'
	# elif text in ['COO','COO-']: text = 'COO-'#coo-bond
	elif text in ['CO2R','RO2C', 'RO,C','CO2*', "COzR'"]: text = 'CO2R'
	elif text in ['CO2', 'COO','OOC', "COz"]: text = 'CO2'
	#direction matter
	elif text in ['O2C', '02C']: text = 'O2C'
	elif text in ['CaH;', 'CHS', 'C2H5']: text = 'C2H5'
	elif text in ['NHBoc','NHBOc', 'BocHN','BOcHN', "BOCHN"]: text = "NHBoc"
	elif text in ['C7H', 'C7H3']: text = 'C7H3'
	elif text in ['CsH11', 'C5H11']: text = 'C5H11'
	elif text in ['CC3CH2O2C', 'CCl3CH2O2C']: text = 'CCl3CH2O2C'
	elif text in ['CH2OMe','MeOH,C','CH,0Me', 'CH,OMe','MeOH2C']: text = 'CH2OMe'
	elif text in ['R', "R'"]: text = '*'
	elif text in ['U', 'U.']: text = 'U'
	elif text in ['RO']: text = 'O*'
	elif text in ['OAc', 'OAC']: text = 'OAc'
	elif text in ['Rg', 'R9']: text = 'R9'
	elif text in ['OQ', '00', '0Q','OCH3']: text = 'OMe'
	# elif text in ['NH', 'HN']: text = '[NH]'
	elif text in ['NH', 'HN', "NH2", 'H2N', 'H,N']: text = 'N'
	elif text in ['OH', 'HO', 'OH2', '0']: text = 'O'
	elif text in ['N(H)Et','Et(H)N']: text = 'N(H)Et'
	elif text in ['N(H)Me','Me(H)N']: text = 'N(H)Me'
	elif text in ['HNOC','CONH']: text='CONH'
	elif text in ['HNOCCH3','CH,CONH','CH3CONH']: text='CH3CONH'
	elif text in ['PPh2','Ph,P','Ph2P']: text='PPh2'
	elif text in ['SF5','F5S']: text = 'SF5'
	elif text in ['OCH2CF3','F3CH2CO']: text = 'OCH2CF3'
	elif text in ['NHCbz','CbzHN']: text = 'NHCbz'
	elif text in ['NHNH2','H2NHN']: text = 'NHNH2'
	elif text in ['CHzCH22N','N2(CH2CH3)','(CH3CH2)2N']: text = '(CH3CH2)2N'
	#NOTE this with 3 neibor bonds, whic in x order, direction matters
	elif text in ['CHCHCH2CH-3','CH2CH2CH2CH']: text = 'CH2CH2CH2CH'
	elif text in ['HCH2CH2CH2C','HCH2CH2CH2C' ]: text = 'HCH2CH2CH2C'

	elif text in ['(HzC)2HC','(H3C)2HC']: text = '(H3C)2HC'
	elif text in ['13CO2SHNH2CH2C','H3CO2SHNH2CH2C','CH2CH2NSO2CH3']: text = 'CH2CH2NSO2CH3'#USPTO
	elif text in ['CgH19','C9H19']: text = 'C9H19'
	elif text in ['(CF2):H','(CF2)8H']: text = '(CF2)8H'

	elif text in ['COOCH3','HzCO2C', 'CO,Me','H3CO2C','CO2CH3','MeOOC','CO2Me','COzMe','MeO2C','MeO,C']: text = 'CO2Me'
	elif text in ['(CHCHO)','CH2CH2O']: text = 'CH2CH2O'
	elif text in ['CO,CysPr','CO2CysPr']: text = 'CO2CysPr'
	elif text in ['CH2CH2C(O)OCHCH3','CH;CH2C(O)OCHCH3']:text='CH2CH2C(O)OCH2CH3'
	elif text in ['H4NOzS','H4NO3S']: text = 'H4NO3S'
	elif text in ['C1OH21','C1oH21','CloH21', 'C10H21']: text = 'C10H21'

	elif text in ['']: text = 'CF2'

	elif text in replacement_map:
	text = replacement_map[text]
	# elif 'NHR' in text or 'RHN' in text:
	# text = NHR_string(text)
	# elif text in ['RHN']: text = 'N*'

	return text





	def C_H_affixExpand(group):
	"""
	Expands CnHm or HmCn chemical group notation into SMILES format.
	Supports formats like C6H11, NHC6H11, H11C6, H11C6HN where H = 2C - 1.
	Returns SMILES string or False if invalid.
	"""
	# Regex patterns
	p_cn_hm = r'^C(\d+)H(\d+)$' # Standalone CnHm (e.g., C6H11)
	p_hm_cn = r'^H(\d+)C(\d+)$' # Standalone HmCn (e.g., H11C6)
	p_prefix = r'^([A-Za-z]+)(C(\d+)H(\d+))$' # Prefix + CnHm (e.g., NHC6H11)
	p_suffix = r'^(C(\d+)H(\d+))([A-Za-z]+)$' # CnHm + Suffix (e.g., C6H11NH)
	p_hm_cn_prefix = r'^([A-Za-z]+)(H(\d+)C(\d+))$' # Prefix + HmCn (e.g., H11C6HN)
	p_hm_cn_suffix = r'^(H(\d+)C(\d+))([A-Za-z]+)$' # HmCn + Suffix (e.g., H11C6NH)

	# 2. Handle CnHm or HmCn with prefix/suffix
	patterns = [
	#pattern, sub_pattern,aff_idx, group_idx, c_idx, h_idx, aff_type
	(p_prefix, p_cn_hm, 1, 2, 3, 4, 'prefix'),
	(p_suffix, p_cn_hm, 4, 1, 2, 3, 'suffix'),
	(p_hm_cn_prefix, p_hm_cn,1, 2, 4, 3, 'prefix'),
	(p_hm_cn_suffix, p_hm_cn, 4, 1, 3, 2, 'suffix')
	]

	# Abbreviation map for common groups
	ABBREVIATIONS2 = {
	'NH': '[NH]', 'HNOC': '[C](=O)[NH]',
	'CONH': '[C](=O)[NH]', 'HN': '[NH]', 'HNO': '[NH]O', 'NO': '[N]=O',
	'COO':'[C](=O)O',
	'CO2':'[C](=O)O',

	}#TODO may need more

	def validate_and_expand(c_count, h_count, prefix=None, suffix=None):
	"""Helper to validate CnHm/HmCn and generate SMILES."""
	if h_count != 2 * c_count + 1: # Check if H = 2C + 1 CmHn
	return False
	# Base SMILES: [CH] for single carbon, or [CH]C...C for multiple
	smiles = '[CH2]C' if c_count == 2 else '[CH2]'+'C' * int(c_count - 1)#NOTE C have to 2n
	print([c_count, h_count, prefix, suffix],'[c_count, h_count, prefix, suffix]')
	if prefix:
	prefix = ABBREVIATIONS2.get(prefix, prefix)
	smiles = prefix + smiles
	if suffix: # Changed from elif to if to handle both prefix and suffix
	suffix = ABBREVIATIONS2.get(suffix, suffix)
	smiles = suffix + smiles #as CmHn are always n=2m+1
	return smiles

	# 1. Handle standalone CnHm or HmCn
	match_cn_hm = re.match(p_cn_hm, group)
	if match_cn_hm:
	c_count, h_count = int(match_cn_hm.group(1)), int(match_cn_hm.group(2))
	return validate_and_expand(c_count, h_count)

	match_hm_cn = re.match(p_hm_cn, group)
	if match_hm_cn:
	h_count, c_count = int(match_hm_cn.group(1)), int(match_hm_cn.group(2))
	return validate_and_expand(c_count, h_count)

	for pattern, sub_pattern,aff_idx, group_idx, c_idx, h_idx, aff_type in patterns:
	match = re.match(pattern, group)
	if match:
	cn_hm = match.group(group_idx)
	affix = match.group(aff_idx) # Other group is prefix/suffix
	c_count = int(match.group(c_idx))
	h_count = int(match.group(h_idx))
	print(cn_hm,affix,c_count,h_count,'cn_hm,affix,c_count,h_count')
	return validate_and_expand(
	c_count, h_count,
	prefix=affix if aff_type == 'prefix' else None,
	suffix=affix if aff_type == 'suffix' else None
	)

	return False

	def N_C_H_expand(group):
	# 使用正则表达式匹配 NHCnHm 中的 n
	match = re.match(r'NHC(\d+)H(\d+)', group)
	match1 = re.match(r'NC(\d+)H(\d+)', group)
	if not match and not match1:
	return False
	# 获取碳原子数
	if match:
	C_count = int(match.group(1))
	H_count = int(match.group(2))
	if match1:
	C_count = int(match1.group(1))
	H_count = int(match1.group(2))
	if H_count== C_count*2 +1 :
	# 构建 SMILES：'[N]' + 'C' * 碳原子数
	smiles = '[N]' + 'C' * C_count
	return smiles

	def C_F_expand(group):
	# 尝试匹配 CnFm 格式 (e.g., C2F5)
	match_cnfm = re.match(r'C(\d+)F(\d+)', group)
	match_cnfm_2 = re.match(r'F(\d+)C(\d+)', group)
	if match_cnfm:
	C_count = int(match_cnfm.group(1))
	F_count = int(match_cnfm.group(2))
	# 验证氟原子数是否符合全氟烷基的规则：F_count = 2 * C_count + 1
	if F_count != 2 * C_count + 1:
	return False
	else:
	# 尝试匹配 CF2CF3 格式 (e.g., CF2CF3, CF2CF2CF3)
	# 匹配一个或多个 CF2 后跟一个 CF3
	match_cfx = re.match(r'(CF2)*CF3$', group)
	if not match_cfx:
	return False
	# 计算碳原子和氟原子数
	cf2_count = group.count('CF2') # 每个 CF2 贡献 1 碳和 2 氟
	C_count = cf2_count + 1 # +1 for the terminal CF3
	F_count = cf2_count * 2 + 3 # Each CF2 has 2F, CF3 has 3F
	# 验证氟原子数是否符合全氟烷基的规则
	if F_count != 2 * C_count + 1:
	return False
	# 构建 SMILES 字符串
	smiles = []
	for i in range(C_count):
	if i < C_count - 1:
	# 前面的碳原子：2个氟原子，形式为 C(F)(F)
	if len(smiles)==0:
	smiles.append('[C](F)(F)')
	else:
	smiles.append('C(F)(F)')
	else:
	# 最后一个碳原子：3个氟原子，形式为 [C](F)(F)(F)
	smiles.append('C(F)(F)(F)')

	# 连接所有部分
	return ''.join(smiles)

	# def C_H_expand(group):
	# """
	# Expands CnHm or HmCn chemical group notation into SMILES format.
	# Supports formats like C18H37HNOC, CONHC3H7, C3H7, H23C11.
	# Returns SMILES string or False if invalid.
	# """
	# # Regex patterns
	# # Regex patterns
	# p_cn_hm = r'^C(\d+)H(\d+)$' # Standalone CnHm (e.g., C6H11)
	# p_hm_cn = r'^H(\d+)C(\d+)$' # Standalone HmCn (e.g., H11C6)
	# p_prefix = r'^([A-Za-z]+)(C\d+H\d+)$' # Prefix + CnHm (e.g., NHC6H11)
	# p_suffix = r'^(C\d+H\d+)([A-Za-z]+)$' # CnHm + Suffix (e.g., C6H11NH)
	# p_hm_cn_prefix = r'^([A-Za-z]+)(H\d+C\d+)$' # Prefix + HmCn (e.g., H11C6HN)
	# p_hm_cn_suffix = r'^(H\d+C\d+)([A-Za-z]+)$' # HmCn + Suffix (e.g., H11C6NH)

	# # Element and suffix replacement map
	# elements = ['S', 'N', 'P', 'C', 'O']
	# keys = [f"{e}{suffix}" for e in elements for suffix in ['R"', "R'", "R", "*"]]
	# replacement_map = {key: f'{key[0]}*' for key in keys}
	# def validate_and_expand(c_count, h_count, prefix=None, suffix=None):
	# """Helper to validate CnHm/HmCn and generate SMILES."""
	# if h_count != 2 * c_count + 1: # Check if valid alkyl group
	# return False
	# smiles = '[CH2]' + 'C' * (c_count - 1)
	# if prefix:
	# prefix = normalize_ocr_text(prefix, replacement_map)
	# smiles = ABBREVIATIONS.get(prefix, prefix) + 'C' * c_count
	# elif suffix:
	# suffix = normalize_ocr_text(suffix, replacement_map)
	# smiles = ABBREVIATIONS.get(suffix, suffix) + 'C' * c_count
	# return smiles

	# # 1. Handle standalone CnHm or HmCn first
	# match_cn_hm = re.match(p_cn_hm, group)
	# if match_cn_hm:
	# c_count, h_count = int(match_cn_hm.group(1)), int(match_cn_hm.group(2))
	# return validate_and_expand(c_count, h_count)

	# match_hm_cn = re.match(p_hm_cn, group)
	# if match_hm_cn:
	# h_count, c_count = int(match_hm_cn.group(1)), int(match_hm_cn.group(2))
	# return validate_and_expand(c_count, h_count)

	# # 2. Handle CnHm or HmCn with prefix/suffix
	# patterns = [
	# (p_prefix, p_cn_hm, 1, 2, 'suffix'),
	# (p_suffix, p_cn_hm, 2, 1, 'prefix'),
	# (p_hm_cn_prefix, p_hm_cn, 1, 2, 'suffix'),
	# (p_hm_cn_suffix, p_hm_cn, 2, 1, 'prefix')
	# ]

	# for pattern, sub_pattern, c_idx, h_idx, aff_type in patterns:
	# match = re.match(pattern, group)
	# if match:
	# cn_hm = match.group(1 if aff_type == 'suffix' else 2)
	# affix = match.group(2 if aff_type == 'suffix' else 1)
	# sub_match = re.match(sub_pattern, cn_hm)
	# if sub_match:
	# c_count = int(sub_match.group(c_idx))
	# h_count = int(sub_match.group(h_idx))
	# return validate_and_expand(
	# c_count, h_count,
	# prefix=affix if aff_type == 'prefix' else None,
	# suffix=affix if aff_type == 'suffix' else None
	# )

	# return False

	import re

	def C_H_expand(group):
	"""
	Expands CnHm or HmCn chemical group notation into SMILES format.
	Supports formats like C18H37HNOC, CONHC3H7, C3H7, H23C11, and (H7C3)2N.
	Returns SMILES string or False if invalid.
	"""
	# Regex patterns
	p_cn_hm = r'^C(\d+)H(\d+)$' # Standalone CnHm (e.g., C6H11)
	p_hm_cn = r'^H(\d+)C(\d+)$' # Standalone HmCn (e.g., H11C6)
	p_prefix = r'^([A-Za-z]+)(C\d+H\d+)$' # Prefix + CnHm (e.g., NHC6H11)
	p_suffix = r'^(C\d+H\d+)([A-Za-z]+)$' # CnHm + Suffix (e.g., C6H11NH)
	p_hm_cn_prefix = r'^([A-Za-z]+)(H\d+C\d+)$' # Prefix + HmCn (e.g., H11C6HN)
	p_hm_cn_suffix = r'^(H\d+C\d+)([A-Za-z]+)$' # HmCn + Suffix (e.g., H11C6NH)

	# New pattern for handling (H7C3)2N format
	p_bracketed_group = r'^$(H(\d+)C(\d+))$(\d+)([A-Za-z]+)$' # Adjusted to handle (H7C3)2N, etc.
	p_reverse_bracketed_group = r'^([A-Za-z]+)$(C(\d+)H(\d+))$(\d+)$' # Handles N(C3H7)2, etc.

	# Element and suffix replacement map
	elements = ['S', 'N', 'P', 'C', 'O']
	keys = [f"{e}{suffix}" for e in elements for suffix in ['R"', "R'", "R", "*"]]
	replacement_map = {key: f'{key[0]}*' for key in keys}

	def validate_and_expand(c_count, h_count, prefix=None, suffix=None):
	"""Helper to validate CnHm/HmCn and generate SMILES."""
	if h_count != 2 * c_count + 1: # Check if valid alkyl group
	return False
	smiles = '[CH2]' + 'C' * (c_count - 1)
	if prefix:
	prefix = normalize_ocr_text(prefix, replacement_map)
	smiles = ABBREVIATIONS.get(prefix, prefix) + 'C' * c_count
	elif suffix:
	suffix = normalize_ocr_text(suffix, replacement_map)
	smiles = ABBREVIATIONS.get(suffix, suffix) + 'C' * c_count
	return smiles

	# 1. Handle standalone CnHm or HmCn first
	match_cn_hm = re.match(p_cn_hm, group)
	if match_cn_hm:
	c_count, h_count = int(match_cn_hm.group(1)), int(match_cn_hm.group(2))
	return validate_and_expand(c_count, h_count)

	match_hm_cn = re.match(p_hm_cn, group)
	if match_hm_cn:
	h_count, c_count = int(match_hm_cn.group(1)), int(match_hm_cn.group(2))
	return validate_and_expand(c_count, h_count)

	# 2. Handle CnHm or HmCn with prefix/suffix
	patterns = [
	(p_prefix, p_cn_hm, 1, 2, 'suffix'),
	(p_suffix, p_cn_hm, 2, 1, 'prefix'),
	(p_hm_cn_prefix, p_hm_cn, 1, 2, 'suffix'),
	(p_hm_cn_suffix, p_hm_cn, 2, 1, 'prefix')
	]

	for pattern, sub_pattern, c_idx, h_idx, aff_type in patterns:
	match = re.match(pattern, group)
	if match:
	cn_hm = match.group(1 if aff_type == 'suffix' else 2)
	affix = match.group(2 if aff_type == 'suffix' else 1)
	sub_match = re.match(sub_pattern, cn_hm)
	if sub_match:
	c_count = int(sub_match.group(c_idx))
	h_count = int(sub_match.group(h_idx))
	return validate_and_expand(
	c_count, h_count,
	prefix=affix if aff_type == 'prefix' else None,
	suffix=affix if aff_type == 'suffix' else None
	)

	base_smiles=False
	# 3. Handle the new (H7C3)2N case TODO may need N2(C3H7)adding
	match_bracketed_group = re.match(p_bracketed_group, group)
	if match_bracketed_group:
	h_count, c_count = int(match_bracketed_group.group(2)), int(match_bracketed_group.group(3))
	prefix = match_bracketed_group.group(5)
	prefix_n = int(match_bracketed_group.group(4))
	print("h_count, c_count,prefix",[h_count, c_count,prefix])
	unit_smi='C'*c_count
	BACKET_SM=f"({unit_smi})"* prefix_n
	base_smiles=f"[{prefix}]{BACKET_SM}"

	# 4. Handle the new N(C3H7)2
	match_reverse_bracketed_group = re.match(p_reverse_bracketed_group, group)
	if match_reverse_bracketed_group:
	c_count, h_count = int(match_reverse_bracketed_group.group(3)), int(match_reverse_bracketed_group.group(4))
	prefix = match_reverse_bracketed_group.group(1)
	prefix_n = int(match_reverse_bracketed_group.group(5))
	print("h_count, c_count,prefix",[h_count, c_count,prefix])
	unit_smi='C'*c_count
	BACKET_SM=f"({unit_smi})"* prefix_n
	base_smiles=f"[{prefix}]{BACKET_SM}"

	if base_smiles:
	# If valid, return the SMILES with the appropriate number of repetitions for the group
	return f"{base_smiles}"


	return False


	def C_H_expand2(group):
	"""
	Expands CnHm or HmCn chemical group notation into SMILES format.
	Supports formats like C6H11, NHC6H11, H11C6, H11C6HN where H = 2C - 1.
	Returns SMILES string or False if invalid.
	"""
	# Regex patterns
	p_cn_hm = r'^C(\d+)H(\d+)$' # Standalone CnHm (e.g., C6H11)
	p_hm_cn = r'^H(\d+)C(\d+)$' # Standalone HmCn (e.g., H11C6)
	p_prefix = r'^([A-Za-z]+)(C\d+H\d+)$' # Prefix + CnHm (e.g., NHC6H11)
	p_suffix = r'^(C\d+H\d+)([A-Za-z]+)$' # CnHm + Suffix (e.g., C6H11NH)
	p_hm_cn_prefix = r'^([A-Za-z]+)(H\d+C\d+)$' # Prefix + HmCn (e.g., H11C6HN)
	p_hm_cn_suffix = r'^(H\d+C\d+)([A-Za-z]+)$' # HmCn + Suffix (e.g., H11C6NH)

	# Abbreviation map for common groups
	ABBREVIATIONS2 = {
	'NH': '[NH]', 'CONH': '[C](=O)[NH]', 'HN': '[NH]', 'HNO': '[NH]O', 'NO': '[N]=O'
	}#TODO may need more

	def validate_and_expand(c_count, h_count, prefix=None, suffix=None):
	"""Helper to validate CnHm/HmCn and generate SMILES."""
	if h_count != 2 * c_count - 1: # Check if H = 2C - 1
	return False
	if c_count % 2 != 0:
	print(f"C#C , c_count have to be 2n!!!")
	return False
	# Base SMILES: [CH] for single carbon, or [CH]C...C for multiple
	smiles = '[C]#C unit repeat' if c_count == 2 else '[C]#C'+'C#C' * int(c_count/2 - 1)#NOTE C have to 2n
	if prefix:
	prefix = ABBREVIATIONS2.get(prefix, prefix)
	smiles = prefix + smiles
	if suffix: # Changed from elif to if to handle both prefix and suffix
	suffix = ABBREVIATIONS2.get(suffix, suffix)
	smiles += suffix
	return smiles

	# 1. Handle standalone CnHm or HmCn
	match_cn_hm = re.match(p_cn_hm, group)
	if match_cn_hm:
	c_count, h_count = int(match_cn_hm.group(1)), int(match_cn_hm.group(2))
	return validate_and_expand(c_count, h_count)

	match_hm_cn = re.match(p_hm_cn, group)
	if match_hm_cn:
	h_count, c_count = int(match_hm_cn.group(1)), int(match_hm_cn.group(2))
	return validate_and_expand(c_count, h_count)

	# 2. Handle CnHm or HmCn with prefix/suffix
	patterns = [
	(p_prefix, p_cn_hm, 2, 1, 2, 'prefix'),
	(p_suffix, p_cn_hm, 1, 1, 2, 'suffix'),
	(p_hm_cn_prefix, p_hm_cn, 2, 2, 1, 'prefix'),
	(p_hm_cn_suffix, p_hm_cn, 1, 2, 1, 'suffix')
	]

	for pattern, sub_pattern, group_idx, c_idx, h_idx, aff_type in patterns:
	match = re.match(pattern, group)
	if match:
	cn_hm = match.group(group_idx)
	affix = match.group(3 - group_idx) # Other group is prefix/suffix
	sub_match = re.match(sub_pattern, cn_hm)
	if sub_match:
	c_count = int(sub_match.group(c_idx))
	h_count = int(sub_match.group(h_idx))
	return validate_and_expand(
	c_count, h_count,
	prefix=affix if aff_type == 'prefix' else None,
	suffix=affix if aff_type == 'suffix' else None
	)

	return False


	def H_C_expand(group):
	# 1. 处理 CnHm 在前的格式，例如 'C18H37HNOC'
	match_cn_hm_prefix = re.match(r'(H\d+C\d+)(.+)', group)
	elements = ['S', 'N', 'P', 'C', 'O']
	keys = [f"{e}{suffix}" for e in elements for suffix in ['R"', "R'", "R", "*"]]
	replacement_map = {key: f'{key[0]}*' for key in keys}

	if match_cn_hm_prefix:
	cn_hm = match_cn_hm_prefix.group(1) # e.g., 'C18H37'
	suffix = match_cn_hm_prefix.group(2) # e.g., 'HNOC'
	# 处理 CnHm 部分
	match_cn_hm = re.match(r'H(\d+)C(\d+)', cn_hm)
	if match_cn_hm:
	C_count = int(match_cn_hm.group(1))
	H_count = int(match_cn_hm.group(2))
	if H_count != 2 * C_count + 1:
	return False
	else:
	smiles = '[C]' + 'C' * (C_count - 1)
	if suffix:
	suffix = normalize_ocr_text(suffix, replacement_map)
	suffix_smi=ABBREVIATIONS[suffix].smiles if suffix in ABBREVIATIONS else suffix
	sub_smic=sub_smic=suffix_smi + 'C' * (C_count )
	return sub_smic
	else:
	return smiles
	return False
	# 2. 处理 CnHm 在后的格式，例如 'CONHC3H7'
	match_cn_hm_suffix = re.match(r'(.+)(H\d+C\d+)$', group)
	if match_cn_hm_suffix:
	prefix = match_cn_hm_suffix.group(1) # e.g., 'CONH'
	cn_hm = match_cn_hm_suffix.group(2) # e.g., 'C3H7'
	# 处理 CnHm 部分
	match_cn_hm = re.match(r'H(\d+)C(\d+)', cn_hm)
	if match_cn_hm:
	C_count = int(match_cn_hm.group(1))
	H_count = int(match_cn_hm.group(2))
	# 可选：验证 H_count，例如直链烷基 H_count = 2 * C_count + 1
	if H_count != 2 * C_count + 1:
	return False
	else:
	smiles = '[C]' + 'C' * (C_count - 1)
	if prefix:
	prefix = normalize_ocr_text(prefix, replacement_map)
	prefix_smi=ABBREVIATIONS[prefix].smiles if prefix in ABBREVIATIONS else prefix
	sub_smic=sub_smic=prefix_smi + 'C' * (C_count )
	return sub_smic
	else:
	return smiles
	return False

	# 3. 原有逻辑处理 CnFm 格式 (e.g., C2F5)
	match_cnfm = re.match(r'H(\d+)C(\d+)', group)
	if match_cnfm:
	C_count = int(match_cnfm.group(1))
	F_count = int(match_cnfm.group(2))
	# 验证氟原子数是否符合全氟烷基的规则：F_count = 2 * C_count + 1
	if F_count != 2 * C_count + 1:
	return False
	smiles = '[C]' + 'C' * (C_count - 1)
	return smiles

	def C_F_expand(group):
	# 尝试匹配 CnFm 格式 (e.g., C2F5)
	match_cnfm = re.match(r'C(\d+)F(\d+)', group)
	if match_cnfm:
	C_count = int(match_cnfm.group(1))
	F_count = int(match_cnfm.group(2))
	# 验证氟原子数是否符合全氟烷基的规则：F_count = 2 * C_count + 1
	if F_count != 2 * C_count + 1:
	return False
	else:
	# 尝试匹配 CF2CF3 格式 (e.g., CF2CF3, CF2CF2CF3)
	# 匹配一个或多个 CF2 后跟一个 CF3
	match_cfx = re.match(r'(CF2)*CF3$', group)
	if not match_cfx:
	return False
	# 计算碳原子和氟原子数
	cf2_count = group.count('CF2') # 每个 CF2 贡献 1 碳和 2 氟
	C_count = cf2_count + 1 # +1 for the terminal CF3
	F_count = cf2_count * 2 + 3 # Each CF2 has 2F, CF3 has 3F
	# 验证氟原子数是否符合全氟烷基的规则
	if F_count != 2 * C_count + 1:
	return False
	# 构建 SMILES 字符串
	smiles = []
	for i in range(C_count):
	if i < C_count - 1:
	# 前面的碳原子：2个氟原子，形式为 C(F)(F)
	if len(smiles)==0:
	smiles.append('[C](F)(F)')
	else:
	smiles.append('C(F)(F)')
	else:
	# 最后一个碳原子：3个氟原子，形式为 [C](F)(F)(F)
	smiles.append('[C](F)(F)(F)')

	# 连接所有部分
	return ''.join(smiles)


	# '\|'.join(list(ABBREVIATIONS.keys()))
	original_str ='\|'.join(list(ABBREVIATIONS.keys()))
	escaped_str = original_str.replace('', r'\').replace('(', r'$').replace(')', r'$')

	FORMULA_REGEX_str='(' + escaped_str + '\|R[0-9]*\|[A-Z][a-z]+\|[A-Z]\|[0-9]+\|$\|$)'
	# print(escaped_str)
	# print(FORMULA_REGEX_str)
	FORMULA_REGEX = re.compile(FORMULA_REGEX_str)
	# placeholder_atoms
	def _parse_tokens(tokens: list):
	"""
	Parse tokens of condensed formula into list of pairs `(elt, num)`
	where `num` is the multiplicity of the atom (or nested condensed formula) `elt`
	Used by `_parse_formula`, which does the same thing but takes a formula in string form as input
	"""
	elements = []
	i = 0
	j = 0
	while i < len(tokens):
	if tokens[i] == '(':
	while j < len(tokens) and tokens[j] != ')':
	j += 1
	elt = _parse_tokens(tokens[i + 1:j])
	else:
	elt = tokens[i]
	j += 1
	if j < len(tokens) and tokens[j].isnumeric():
	num = int(tokens[j])
	j += 1
	else:
	num = 1
	elements.append((elt, num))
	i = j
	return elements


	def _parse_formula(formula: str):
	"""
	Parse condensed formula into list of pairs `(elt, num)`
	where `num` is the subscript to the atom (or nested condensed formula) `elt`
	Example: "C2H4O" -> [('C', 2), ('H', 4), ('O', 1)]
	"""
	tokens = FORMULA_REGEX.findall(formula)
	# if ''.join(tokens) != formula:
	# tokens = FORMULA_REGEX_BACKUP.findall(formula)
	return _parse_tokens(tokens)


	def _expand_carbon(elements: list):
	"""
	Given list of pairs `(elt, num)`, output single list of all atoms in order,
	expanding carbon sequences (CaXb where a > 1 and X is halogen) if necessary
	Example: [('C', 2), ('H', 4), ('O', 1)] -> ['C', 'H', 'H', 'C', 'H', 'H', 'O'])
	"""
	expanded = []
	i = 0
	while i < len(elements):
	elt, num = elements[i]
	# skip unreasonable number of atoms
	if num > 100000:
	i += 1; continue
	# expand carbon sequence
	if elt == 'C' and num > 1 and i + 1 < len(elements):
	next_elt, next_num = elements[i + 1]
	if next_num > 100000:
	i += 1; continue
	quotient, remainder = next_num // num, next_num % num
	for _ in range(num):
	expanded.append('C')
	for _ in range(quotient):
	expanded.append(next_elt)
	for _ in range(remainder):
	expanded.append(next_elt)
	i += 2
	# recurse if `elt` itself is a list (nested formula)
	elif isinstance(elt, list):
	new_elt = _expand_carbon(elt)
	for _ in range(num):
	expanded.append(new_elt)
	i += 1
	# simplest case: simply append `elt` `num` times
	else:
	for _ in range(num):
	expanded.append(elt)
	i += 1
	if expanded==[]:
	return False
	else:
	return expanded

	def replace_bracket(match):
	content = match.group(1)
	# 条件1：包含数字或 '+' 或 '-'，保留整个 [content]
	if re.search(r'\d\|\+\|-', content):
	return f'[{content}]'
	# 条件2：仅为 'H'，保留
	elif content == 'H':
	return '[H]'
	# 条件3：字符长度 >=2 且包含 'H'，则去除括号和 H
	elif len(content) >= 2 and 'H' in content:
	return ''.join([ch for ch in content if ch != 'H'])
	# 条件4：其他情况，去掉括号
	else:
	return content

	# return re.sub(r'\[([^\[\]]+)\]', replace_bracket, smi)

	def formula_regex(abbrev):# molscribe way for the combine abbver style
	tokens = FORMULA_REGEX.findall(abbrev)
	# elements=_parse_tokens(tokens)
	abbrev_exp=_expand_carbon(_parse_tokens(tokens))
	if abbrev_exp==[]:
	return False
	else:
	return abbrev_exp

	def _expand_abbreviationMS(abbrev):
	"""
	Expand abbreviation into its SMILES; also converts [Rn] to [n*]
	Used in `_condensed_formula_list_to_smiles` when encountering abbrev. in condensed formula
	"""
	if abbrev in ABBREVIATIONS:
	return ABBREVIATIONS[abbrev].smiles
	# if abbrev in RGROUP_SYMBOLS or (abbrev[0] == 'R' and abbrev[1:].isdigit()):
	if abbrev in RGROUP_SYMBOLS or (abbrev[0] in RGROUP_SYMBOLS and abbrev[1:].isdigit()):
	if abbrev[1:].isdigit():
	return f'[{abbrev[1:]}*]'
	return '*'
	return f'[{abbrev}]'


	def _get_bond_symb(bond_num):
	"""
	Get SMILES symbol for a bond given bond order
	Used in `_condensed_formula_list_to_smiles` while writing the SMILES string
	"""
	if bond_num == 0:
	return '.'
	elif bond_num == 1:
	return ''
	elif bond_num == 2:
	return '='
	elif bond_num == 3:
	return '#'
	else:
	print(f"check this val {bond_num} !!!" )

	return ''
	def _condensed_formula_list_to_smiles(formula_list, start_bond, end_bond=None, direction=None):
	"""
	Converts condensed formula (in the form of a list of symbols) to smiles
	Input:
	`formula_list`: e.g. ['C', 'H', 'H', 'N', ['C', 'H', 'H', 'H'], ['C', 'H', 'H', 'H']] for CH2N(CH3)2
	`start_bond`: # bonds attached to beginning of formula
	`end_bond`: # bonds attached to end of formula (deduce automatically if None)
	`direction` (1, -1, or None): direction in which to process the list (1: left to right; -1: right to left; None: deduce automatically)
	Returns:
	`smiles`: smiles corresponding to input condensed formula
	`bonds_left`: bonds remaining at the end of the formula (for connecting back to main molecule); should equal `end_bond` if specified
	`num_trials`: number of trials
	`success` (bool): whether conversion was successful
	"""
	# `direction` not specified: try left to right; if fails, try right to left
	if direction is None:
	num_trials = 1
	for dir_choice in [1, -1]:
	smiles, bonds_left, trials, success = _condensed_formula_list_to_smiles(formula_list, start_bond, end_bond, dir_choice)
	num_trials += trials
	if success:
	return smiles, bonds_left, num_trials, success
	return None, None, num_trials, False
	assert direction == 1 or direction == -1

	def dfs(smiles, bonds_left, cur_idx, add_idx):
	"""
	`smiles`: SMILES string so far
	`cur_idx`: index (in list `formula`) of current atom (i.e. atom to which subsequent atoms are being attached)
	`cur_flat_idx`: index of current atom in list of atom tokens of SMILES so far
	`bonds_left`: bonds remaining on current atom for subsequent atoms to be attached to
	`add_idx`: index (in list `formula`) of atom to be attached to current atom
	`add_flat_idx`: index of atom to be added in list of atom tokens of SMILES so far
	Note: "atom" could refer to nested condensed formula (e.g. CH3 in CH2N(CH3)2)
	"""
	num_trials = 1
	# end of formula: return result
	if (direction == 1 and add_idx == len(formula_list)) or (direction == -1 and add_idx == -1):
	if end_bond is not None and end_bond != bonds_left:
	return smiles, bonds_left, num_trials, False
	return smiles, bonds_left, num_trials, True

	# no more bonds but there are atoms remaining: conversion failed
	if bonds_left <= 0:
	return smiles, bonds_left, num_trials, False
	to_add = formula_list[add_idx] # atom to be added to current atom
	if not isinstance(to_add, str):
	return smiles, bonds_left, num_trials, False
	if isinstance(to_add, list): # "atom" added is a list (i.e. nested condensed formula): assume valence of 1
	if bonds_left > 1:
	# "atom" added does not use up remaining bonds of current atom
	# get smiles of "atom" (which is itself a condensed formula)
	add_str, val, trials, success = _condensed_formula_list_to_smiles(to_add, 1, None, direction)
	if val > 0:
	add_str = _get_bond_symb(val + 1) + add_str
	num_trials += trials
	if not success:
	return smiles, bonds_left, num_trials, False
	# put smiles of "atom" in parentheses and append to smiles; go to next atom to add to current atom
	result = dfs(smiles + f'({add_str})', bonds_left - 1, cur_idx, add_idx + direction)
	else:
	# "atom" added uses up remaining bonds of current atom
	# get smiles of "atom" and bonds left on it
	add_str, bonds_left, trials, success = _condensed_formula_list_to_smiles(to_add, 1, None, direction)
	num_trials += trials
	if not success:
	return smiles, bonds_left, num_trials, False
	# append smiles of "atom" (without parentheses) to smiles; it becomes new current atom
	result = dfs(smiles + add_str, bonds_left, add_idx, add_idx + direction)
	smiles, bonds_left, trials, success = result
	num_trials += trials
	return smiles, bonds_left, num_trials, success
	# atom added is a single symbol (as opposed to nested condensed formula)
	for val in VALENCES.get(to_add, [1]): # try all possible valences of atom added
	add_str = _expand_abbreviationMS(to_add) # expand to smiles if symbol is abbreviation
	if bonds_left > val: # atom added does not use up remaining bonds of current atom; go to next atom to add to current atom
	if cur_idx >= 0:
	add_str = _get_bond_symb(val) + add_str
	result = dfs(smiles + f'({add_str})', bonds_left - val, cur_idx, add_idx + direction)
	else: # atom added uses up remaining bonds of current atom; it becomes new current atom
	if cur_idx >= 0:
	add_str = _get_bond_symb(bonds_left) + add_str
	result = dfs(smiles + add_str, val - bonds_left, add_idx, add_idx + direction)
	trials, success = result[2:]
	num_trials += trials
	if success:
	return result[0], result[1], num_trials, success
	if num_trials > 10000:
	break
	return smiles, bonds_left, num_trials, False

	cur_idx = -1 if direction == 1 else len(formula_list)
	add_idx = 0 if direction == 1 else len(formula_list) - 1
	return dfs('', start_bond, cur_idx, add_idx)

	def swap_paren_bracket(text):
	# Check if string starts with '('
	if not text.startswith('('):
	return text
	# Pattern: match (...) followed by [...]
	pattern = r'^$(.?)$\[(.?)\]'
	# Find match
	match = re.match(pattern, text)
	if match:
	# Swap the groups: [group2](group1)
	return f'[{match.group(2)}]({match.group(1)})'

	return text

	def convert_ch2_string(s):
	# 匹配 (CH2)后面跟数字或字母的模式
	pattern = r'$CH2$(\d+\|[a-zA-Z]+)'
	match = re.fullmatch(pattern, s)
	if not match:
	return s # 如果不是目标模式，返回原字符串

	suffix = match.group(1)

	if suffix.isdigit():
	n = int(suffix)
	if n == 1:
	return '[CH2]'
	else:
	return '[CH2]' + 'C' * (n - 1)
	else:
	# 处理变量情况，如 (CH2)m
	var = suffix
	print(var,s)
	return s


	def process_string_joinused(s):
	# 检查字符串是否以[]开头
	match = re.match(r'^\[([^\]])\](.)$', s)
	if not match:
	return s # 如果不匹配，直接返回原字符串

	content, rest = match.groups()
	# 计算[]中字符数
	char_count = len(content)

	# 如果字符数大于1且包含H
	if char_count > 1 and 'H' in content:
	# 移除H及其后连续的数字
	new_content = re.sub(r'H\d*', '', content)
	return f'[{new_content}]{rest}'
	return s

	def all_elements_in_dict(lst, dictionary):
	"""
	递归检查列表（可能嵌套）中的所有元素是否都存在于字典的键中

	:param lst: 要检查的列表（可能包含嵌套列表）
	:param dictionary: 要检查的字典
	:return: 如果所有元素都在字典键中返回True，否则返回False
	"""
	for element in lst:
	if isinstance(element, list):
	# 如果是嵌套列表，递归检查
	if not all_elements_in_dict(element, dictionary):
	return False
	else:
	# 如果是普通元素，检查是否在字典键中
	if element not in dictionary:
	return False
	return True

	def expand_cf2_to_smiles(input_string):
	# 正则表达式匹配 (CF2)nX 的模式，X 为任意字母数字字符串
	pattern = r'$CF2$(\d+)([A-Za-z0-9]+)'
	match = re.match(pattern, input_string)
	if not match:
	return input_string
	# 提取数字 n 和末尾的化学基团 X
	n = int(match.group(1))
	tail_group = f"[{match.group(2)}]"
	# 构建 SMILES 字符串
	# 每个 CF2 单元是 [C](F)(F)，重复 n 次，最后接 tail_group
	cf2_unit = 'C(F)(F)'
	smiles = '[C](F)(F)' + cf2_unit * (n-1) + tail_group if n > 0 else tail_group
	return smiles

	def find_repeating_unit_and_smiles(s):
	match = re.fullmatch(r'(.+?)(?:\1)+', s)
	if match:
	unit = match.group(1)
	repeat_count = len(s) // len(unit)
	# 根据重复单元生成SMILES（适当处理CH2 -> C, CF2 -> CF2）
	if unit == "CH2":
	smiles_unit = "C" # CH2 -> C
	smi_init="[CH2]"
	elif unit == "CF2":
	smiles_unit = "C(F)(F)" # CF2保持原样
	smi_init="[C](F)(F)"
	elif unit == "SO2":
	smiles_unit = "S(=O)(=O)" # SO2保持原样
	smi_init="[S](=O)(=O)"
	else:
	smiles_unit,smi_init='',''
	print(f'please add the repateat patter here !!! for: {s}')
	# smiles_unit = unit # 其他单元直接使用
	# 生成最终的SMILES
	smiles = smi_init + smiles_unit * (repeat_count - 1 )

	return smiles, repeat_count, unit
	else:
	return None, 0, None # 如果没有匹配到，则返回None

	def get_smiles_from_symbol(symbol, mol, bonds):
	"""
	Convert symbol (abbrev. or condensed formula) to smiles
	If condensed formula, determine parsing direction and num. bonds on each side using coordinates
	"""
	if symbol in ABBREVIATIONS:
	return ABBREVIATIONS[symbol].smiles
	if symbol in RGROUP_SYMBOLS or (symbol[0] in RGROUP_SYMBOLS and symbol[1:].isdigit()):
	if symbol[1:].isdigit():
	return f'[{symbol[1:]}*]'
	return '*'

	if len(symbol) > 20:
	return None
	smiles=convert_ch2_string(symbol)
	if smiles !=symbol:
	return smiles
	if '(CF2)' in symbol:
	smiles=expand_cf2_to_smiles(symbol)
	return smiles
	smiles, repeat_count, unit = find_repeating_unit_and_smiles(symbol)
	if repeat_count>0:
	return smiles

	#TODO@@@ add as speical case or add function,
	# this is hard encode NOTE fix this next version
	if symbol in ['CH2CH','CHCH2','CH2CH2', 'CH2CH2CH','CH2CH2CH','H2CH2CHC','CHCH2CH2','(CH2)10', 'H2C','CH2',#'CH2CH2NSO2CH3',
	'OCH2CHOHCH2NH','OCH2CHOHCH2','CF2O','OF2C','EtO2CHN','EtO2C',
	'CH2CH2C(O)0CH2CH3','CH2CH2C(O)OCH2CH3','l23I',
	'OCH2CH2OH','OCH2CHCH2CCH3','CH2O',
	'(H4NO)2','SO2NHCH2CH','OCH2CH','OCF2H','COCOOCH2CH3','CH2CH2CH2CH','HCH2CH2CH2C','CF3CF2CF2CF2SO3',
	# 'SO2(CH2)3SO2NHCH2CHCH2OH',
	'(CF2)8H','PH3C','CO','OC',
	'CF2CF2H','NHSO2CH3','CH2CH2C','CH;CH2C(O)0CHCH3','CH2CH2C(O)OCHCH3',
	'NH2','H2N', 'CHO', 'OHC', 'N(SO2CH3)2','CH2CH2O','CH2CH2C(O)OCH2CH3',
	#ACS
	'Ar2P(O)','PhO2S','NHP(O)Ph2','P*Ph3','P+Ph3','NH2.HCl',
	#CLEF
	'S[O]a',
	#USPTO
	'(C3H6O)7CH3','HC','(HC','(CH2CH2CH2CH-)','3(CHCHCHCH272',
	#UOB
	'NHzBrH','NH2BrH',
	#staker
	'(co)','(CO)',
	#JPO
	'CH3CH','CH3CCH3','CH3CO','CH3OCH2','CO2C','CH2CO2CH3',"COCl",
	]:#NOTE this are not passed by _condensed_formula_list_to_smiles function
	#TODO fix me in next version, may be need LLM to track this
	# Substitution(['CHO', 'OHC'], '[CH1](=O)', "[CH1](=O)", 0.5),
	# Substitution(['NH2','H2N'], '[NH2;D1]', "[NH2]", 0.1),
	#TODO symbol2SMILES() need dig ChemDraw
	if symbol in ['CH2CH','CHCH2']:smiles='[CH2][CH]'
	elif symbol in ['PH3C']:smiles='[CH2]P'
	elif symbol in ['l23I']:smiles='[I]'
	elif symbol in ['HC','(HC']:smiles='[CH]'
	elif symbol in ['NHzBrH','NH2BrH']:smiles='[NH2].Br'
	elif symbol in ['(C3H6O)7CH3']:smiles="[O]CCC"+"OCCC"*6+'C'#TODO maybe as function
	elif symbol in ['NH2.HCl']:smiles="[NH2].Cl"
	elif symbol in ['CH2CH2CH2CH','(CH2CH2CH2CH-)']:smiles='[CH2]CC[CH]'
	elif symbol in ['3(CHCHCHCH272', 'CHCHCHCH2']:smiles='[CH]CC[CH2]'
	# elif symbol in ['D']:smiles='[2H]'
	elif symbol in [ 'CH3CH']:smiles='[CH]C'
	elif symbol in [ 'CH2CO2CH3']:smiles='[CH2]C(=O)OC'
	elif symbol in [ 'CO2C']:smiles='[C](=O)O[C]'
	elif symbol in [ 'CH3CCH3']:smiles='[C](C)(C)'
	elif symbol in [ 'CH3CO']:smiles='[C](=O)C'
	elif symbol in [ 'CH3OCH2']:smiles='[CH2]OC'

	elif symbol in [ '(co)','(CO)']:smiles='[C](=O)'
	elif symbol in ['Ar2P(O)']:smiles='[P]()()(=O)'
	elif symbol in ['PhO2S']:smiles='[S](=O)(=O)c1ccccc1'
	elif symbol in ['CO','OC']:smiles='[C](=O)'
	elif symbol in ['CH2O']:smiles='[CH2][O]'
	elif symbol in ['P*Ph3','P+Ph3',]:smiles='[P+](c1ccccc1)(c1ccccc1)(c1ccccc1)'
	elif symbol in ['NHP(O)Ph2']:smiles='[NH]P(=O)(c1ccccc1)c1ccccc1'
	elif symbol in ['CH;CH2C(O)0CHCH3','CH2CH2C(O)OCHCH3']:smiles='[CH2]CC(=O)OCC'
	elif symbol in ['CH2CH2CH','H2CH2CHC','CHCH2CH2']:smiles='[CH2][CH2][CH]'
	elif symbol in ['CH2CH2CH2CH']:smiles='[CH2]CC[CH]'
	elif symbol in ['HCH2CH2CH2C']:smiles='[CH]CC[CH2]'
	elif symbol in ['H2C','CH2']:smiles='[CH2]'
	elif symbol in ['H2CH2C','CH2CH2']:smiles='[CH2][CH2]'
	elif symbol in ['CHO', 'OHC']:smiles="[CH](=O)"
	elif symbol in ['NH2','H2N']:smiles="[NH2]"
	elif symbol in ['(CF2)8H',]:smiles="[C](F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)C(F)(F)"
	elif symbol in ['CH2CH2C(O)OCH2CH3','CH2CH2C(O)0CH2CH3']:smiles='[CH2]CC(=O)OCC'
	elif symbol in ['CF3CF2CF2CF2SO3']:smiles='[S](=O)(=O)([O-])C(F)(F)C(F)(F)C(F)(F)C(F)(F)(F)'
	elif symbol in ['S[O]a']:smiles='[S](=O)'
	elif symbol in ['COCl']:smiles='[C](=O)Cl'



	elif symbol in ['OCF2H']:smiles="[O]C(F)(F)"
	elif symbol in ['CF2O']:smiles="[C](F)(F)[O]"
	elif symbol in ['OF2C']:smiles="[O][C](F)(F)"
	elif symbol in ['CF2CF2H']:smiles="[C](F)(F)C(F)(F)"
	# elif symbol in ['CH2CH2NSO2CH3']:smiles='[CH2]CNS(=O)(C)=O'
	elif symbol in ['CH2CH2O']:smiles='[CH2]CO'
	elif symbol in ['OCH2CH2OH']:smiles='[O]CCO'#NOTE Chemdraw may give some idea
	elif symbol in ['EtO2CHN']:smiles='[N]C(=O)OCC'
	elif symbol in ['OCH2CHOHCH2NH']:smiles='[O]CC(O)CN'
	elif symbol in ['OCH2CHCH2CCH3']:smiles='[O]C[CH]C[C]C'
	elif symbol in ['(H4NO)2']:smiles='[O]NON'
	elif symbol in ['SO2NHCH2CH']:smiles='[S](=O)(=O)NC[CH]'
	elif symbol in ['N(SO2CH3)2']:smiles='[N](S(=O)(=O)C)(S(=O)(=O)C)'
	elif symbol in ['CH2CH2C(O)OCH2CH3']:smiles='[CH2]CC(=O)OCC'
	elif symbol in ['OCH2CH']:smiles='[O]C[CH]'
	elif symbol in ['EtO2C']:smiles='C(=O)OCC'
	elif symbol in ['CH2CH2C']:smiles='[CH2]C[C]'
	elif symbol in ['NHSO2CH3']:smiles='[NH]S(=O)(=O)C'
	elif symbol in ['COCOOCH2CH3']:smiles='C(=O)C(=O)OCC'
	# elif symbol in ['SO2(CH2)3SO2NHCH2CHCH2OH']:smiles='[S](=O)(=O)CCCS(=O)(=O)NC[C]CO'
	# elif symbol in ['H4NO3S']:smiles='[S]NCC'
	# elif symbol in ['(CH2)10','[CH]CCCCCCCCC']:smiles='[CH]CCCCCCCCC'#as in convert_ch2_string()
	else:smiles=None
	return smiles

	total_bonds = int(sum([bond.GetBondTypeAsDouble() for bond in bonds]))#TODO aromtaic bond effect ??
	formula_list = _expand_carbon(_parse_formula(symbol))
	# all_in_dict = all(fl in ABBREVIATIONS for fl in formula_list)
	all_in_dict=all_elements_in_dict(formula_list,ABBREVIATIONS)
	#total_bonds, bonds_left 机制是有问题的，所以需要以上的修补，机制不完善
	smiles, bonds_left, num_trails, success = _condensed_formula_list_to_smiles(formula_list, total_bonds, None)
	# if debug:
	print(f'{[formula_list, total_bonds]} use _condensed_formula_list_to_smiles {success} <<-------\n {smiles}')
	if success:
	smiles=swap_paren_bracket(smiles)
	return smiles
	elif all_in_dict :#NOTE resolve abbv combine
	# smiles=ABBREVIATIONS[formula_list[0]].smiles
	key = extract_abbreviation_key(formula_list[0])
	if key in ABBREVIATIONS:
	smiles = ABBREVIATIONS[key].smiles
	else:
	# raise ValueError(f"Abbreviation {key} not found in ABBREVIATIONS.")
	print(f"Abbreviation {key} not found in ABBREVIATIONS.")
	smiles=''
	for fl_i in range(1,len(formula_list)):
	cur_smi=process_string_joinused(ABBREVIATIONS[formula_list[fl_i]].smiles)
	smiles += cur_smi
	return smiles

	return None

	def abbrev2smile(abbrev,abbrev_exp,mol,idx):

	atom_gost = mol.GetAtomWithIdx(idx)
	bonds_gost = atom_gost.GetBonds()
	sub_smi = get_smiles_from_symbol(abbrev, mol, bonds_gost)

	if sub_smi:
	# print(f"succes expanding {abbrev},{abbrev_exp}\n{sub_smi}\t{idx}")
	return sub_smi
	else:
	print(f"failed expanding {abbrev},{abbrev_exp}\n{sub_smi}\t{idx}")
	return '[*]'

	# if abbrev_exp[0] in ABBREVIATIONS:
	# init_smi=ABBREVIATIONS[abbrev_exp[0]].smiles
	# else:
	# if len(abbrev_exp[0])==1:
	# init_smi=f'[{abbrev_exp[0]}]'
	# else:
	# print(f"{abbrev_exp[0]} @@@formula_regex")
	# init_smi=f'[{abbrev_exp[0]}]'
	# # init_smi=ABBREVIATIONS[abbrev_exp[0]].smiles if abbrev_exp[0] in ABBREVIATIONS else
	# if len(abbrev_exp)==1:
	# sub_smi=init_smi
	# return sub_smi
	# elif len(abbrev_exp)>1:
	# sub_smi=init_smi
	# for i_ in range(1,len(abbrev_exp)):

	# smi_=ABBREVIATIONS[abbrev_exp[i_]].smiles if abbrev_exp[i_] in ABBREVIATIONS else f'[{abbs[i_]}]'
	# smi_2=re.sub(r'\[([^\[\]]+)\]', replace_bracket, smi_)
	# sub_smi +=smi_2#default combine them with single bond TODO fixme ifneed
	# return sub_smi
	# else:
	# return False
	def replace_cg_notation(astr):
	def replacer(match):
	h_count = int(match.group(1))
	c_count = (h_count - 1) // 2
	return f'C{c_count}H{h_count}'

	return re.sub(r'CgH(\d+)', replacer, astr)


	def _expand_abbreviation(abbrev, mol,idx):# ABBREVIATIONS, RGROUP_SYMBOLS, ELEMENTS):
	"""
	Expand abbreviation into its SMILES; also converts [Rn] to [n*].
	"""

	if abbrev in ABBREVIATIONS:
	return ABBREVIATIONS[abbrev].smiles
	# elif sub_smi_HC:return sub_smi_HC
	elif N_C_H_expand(abbrev):return N_C_H_expand(abbrev)
	elif C_F_expand(abbrev):return C_F_expand(abbrev)
	elif C_H_expand2(abbrev):return C_H_expand2(abbrev)
	elif C_H_expand(abbrev):return C_H_expand(abbrev)
	elif C_H_affixExpand(abbrev):return C_H_affixExpand(abbrev)
	# elif abbrev in RGROUP_SYMBOLS or (abbrev[0] == 'R' and abbrev[1:].isdigit()):
	elif abbrev in RGROUP_SYMBOLS or (abbrev[0] in RGROUP_SYMBOLS and abbrev[1:].isdigit()):
	if abbrev[1:].isdigit():
	return f'[{abbrev[1:]}*]'
	elif abbrev in ELEMENTS:
	return f'[{abbrev}]'

	elif formula_regex(abbrev):
	abbrev_exp= formula_regex(abbrev)
	return abbrev2smile(abbrev,abbrev_exp,mol,idx)#last use Molscribe way

	match = re.match(r'^(\d+)?(.*)', abbrev)
	if match:
	numeric_part, remaining_part = match.groups()
	if remaining_part in ELEMENTS:
	return f'[{abbrev}]'
	elif numeric_part:
	return f'[{numeric_part}*]'

	else:
	print(f"fixme !!!@@@@: {abbrev}")

	return '[*]'

	def count_current_bonds(mol, atom_idx):
	"""Count current bonds (including bond order) for an atom."""
	atom = mol.GetAtomWithIdx(atom_idx)
	return sum(bond.GetBondTypeAsDouble() for bond in atom.GetBonds())

	debug_not=True

	def expandABB(mol, ABBREVIATIONS, placeholder_atoms):#, RGROUP_SYMBOLS, ELEMENTS):
	mols = [mol]
	# 逆序遍历 placeholder_atoms，确保删除后不会影响后续索引
	for idx in sorted(placeholder_atoms.keys(), reverse=True) :
	group = placeholder_atoms[idx]
	group_smiles = _expand_abbreviation(group,mol,idx)
	submol = Chem.MolFromSmiles(group_smiles) # 获取官能团的子分子
	try:
	submol_rw = Chem.RWMol(submol) # 转换为可编辑的 RWMol
	except Exception as e:
	print(f"abbver: {group}")
	print(f'try to convert {group_smiles} to sub_mol')
	print(e)
	if debug_not:
	print(f"Failed to convert {group_smiles} to sub_mol, using placeholder [*] instead.")
	submol = Chem.MolFromSmiles('[*]')
	submol_rw = Chem.RWMol(submol)
	else:
	raise e#NOTE use it when debugging with adding abber and fixing rules in det_engine.py

	# 1. 识别 submol 的 anchor atoms（连接点）
	anchor_atoms = [0]#always use the fisrt atom as anchor atom
	for atom in submol_rw.GetAtoms():
	# 具有自由基的原子或标记为连接点的原子（例如 [*]）
	if atom.GetNumRadicalElectrons() > 0 and atom.GetIdx() not in anchor_atoms:# or atom.GetSymbol() == '*':
	anchor_atoms.append(atom.GetIdx())
	# 2. 复制主分子
	new_mol = Chem.RWMol(mol)
	placeholder_idx = idx
	# 3. 记录 placeholder (*) 原子的邻居及其键类型
	bonds_info = []
	for bond in new_mol.GetBonds():
	if bond.GetBeginAtomIdx() == placeholder_idx:
	bonds_info.append({
	"neighbor": bond.GetEndAtomIdx(),
	"bond_type": bond.GetBondType()
	})
	elif bond.GetEndAtomIdx() == placeholder_idx:
	bonds_info.append({
	"neighbor": bond.GetBeginAtomIdx(),
	"bond_type": bond.GetBondType()
	})

	# 4. 断开 placeholder 的所有键
	for bond_info in bonds_info:
	new_mol.RemoveBond(placeholder_idx, bond_info["neighbor"])

	# 5. 删除 placeholder 原子
	new_mol.RemoveAtom(placeholder_idx)

	# 6. 重新计算邻居索引（删除后索引变化）
	adjusted_bonds_info = []
	for bond_info in bonds_info:
	neighbor = bond_info["neighbor"]
	if neighbor < placeholder_idx:
	adjusted_neighbor = neighbor
	else:
	adjusted_neighbor = neighbor - 1 # 索引因删除原子而减 1
	adjusted_bonds_info.append({
	"neighbor": adjusted_neighbor,
	"bond_type": bond_info["bond_type"]
	})

	# 7. 合并 submol
	new_mol = Chem.RWMol(Chem.CombineMols(new_mol, submol_rw))

	# 8. 计算 submol 的 anchor atoms 在合并后的索引
	submol_atom_offset = new_mol.GetNumAtoms() - submol_rw.GetNumAtoms()
	new_anchor_indices = [submol_atom_offset + anchor_idx for anchor_idx in anchor_atoms]

	# 9. 重新连接官能团，使用原始键类型
	if len(new_anchor_indices) == 1:
	# 单连接点情况：所有邻居连接到唯一的 anchor atom
	anchor_idx = new_anchor_indices[0]
	for bond_info in adjusted_bonds_info:
	neighbor = bond_info["neighbor"]
	bond_type = bond_info["bond_type"]
	new_mol.AddBond(neighbor, anchor_idx, bond_type)
	# 重置自由基电子数
	a1 = new_mol.GetAtomWithIdx(neighbor)
	a2 = new_mol.GetAtomWithIdx(anchor_idx)
	a1.SetNumRadicalElectrons(0)
	a2.SetNumRadicalElectrons(0)
	else:
	# # 多连接点情况：先尝试按顺序连接, 如果* 连* 会存在多种合理价态的不同分子情况
	# 多连接点情况：根据邻居数量和 anchor atoms 分配连接
	if len(adjusted_bonds_info) > len(new_anchor_indices):
	print(adjusted_bonds_info,' <---adjusted_bonds_info')
	print(new_anchor_indices,'<---new_anchor_indices')
	# raise ValueError(f"Too many neighbors ({len(adjusted_bonds_info)}) for submol with {len(new_anchor_indices)} anchor atoms.")
	# for i, bond_info in enumerate(adjusted_bonds_info):
	# # 按顺序将邻居连接到 anchor atoms
	# anchor_idx = new_anchor_indices[i % len(new_anchor_indices)]
	# neighbor = bond_info["neighbor"]
	# bond_type = bond_info["bond_type"]
	# new_mol.AddBond(neighbor, anchor_idx, bond_type)
	# # 重置自由基电子数
	# a1 = new_mol.GetAtomWithIdx(neighbor)
	# a2 = new_mol.GetAtomWithIdx(anchor_idx)
	# a1.SetNumRadicalElectrons(0)
	# a2.SetNumRadicalElectrons(0)
	# 跟踪每个 anchor 的当前成键数
	anchor_bond_counts = {idx: new_mol.GetAtomWithIdx(idx).GetTotalValence() for idx in new_anchor_indices}
	print(anchor_bond_counts,'<---anchor_bond_counts')
	# max_valence = {6: 4, 7: 3, 8: 2} # 示例：C=4, N=3, O=2，需根据实际原子类型扩展
	adjusted_bonds_info = sorted(adjusted_bonds_info, key=lambda x: x['neighbor'])
	if mol.GetNumConformers() > 0:#as some mol may not have the conf dispite pass the 2d assign process
	pos_0 = mol.GetConformer().GetAtomPosition(adjusted_bonds_info[0]['neighbor'])
	pos_1 = mol.GetConformer().GetAtomPosition(adjusted_bonds_info[-1]['neighbor'])
	print(pos_0.x,pos_1.x,"xxx",adjusted_bonds_info)
	# if group =='SO2NH':
	# if pos_0.x <pos_1.x:
	# adjusted_bonds_info=[adjusted_bonds_info[-1],adjusted_bonds_info[0]]
	# elif group =='NHO2S':
	# if pos_0.x <pos_1.x:
	# adjusted_bonds_info=[adjusted_bonds_info[-1],adjusted_bonds_info[0]]


	for bond_info in adjusted_bonds_info:
	neighbor = bond_info["neighbor"]
	bond_type = bond_info["bond_type"]
	bond_valence = {Chem.BondType.SINGLE: 1, Chem.BondType.DOUBLE: 2, Chem.BondType.TRIPLE: 3}.get(bond_type, 1)
	# 寻找未饱和的 anchor
	selected_anchor_idx = None
	for anchor_idx in new_anchor_indices:
	atom = new_mol.GetAtomWithIdx(anchor_idx)
	atomic_num = atom.GetAtomicNum()
	current_valence = anchor_bond_counts[anchor_idx]
	max_allowed = max(VALENCES.get( atom.GetSymbol(), [1])) # 默认最大价态为1（可根据需求调整）
	if current_valence + bond_valence <= max_allowed:
	selected_anchor_idx = anchor_idx
	break
	if selected_anchor_idx is None:
	continue # 跳过，当前没有可用的未饱和 anchor
	# 添加键
	new_mol.AddBond(neighbor, selected_anchor_idx, bond_type)
	# 更新成键数
	anchor_bond_counts[selected_anchor_idx] += bond_valence
	# 重置自由基电子数
	a1 = new_mol.GetAtomWithIdx(neighbor)
	a2 = new_mol.GetAtomWithIdx(selected_anchor_idx)
	a1.SetNumRadicalElectrons(0)
	a2.SetNumRadicalElectrons(0)



	# 多连接点情况：先尝试按顺序连接
	# success = False
	# temp_mol = Chem.RWMol(new_mol) # 备份分子
	# try:
	# for i, bond_info in enumerate(adjusted_bonds_info):
	# anchor_idx = new_anchor_indices[i % len(new_anchor_indices)]
	# neighbor = bond_info["neighbor"]
	# bond_type = bond_info["bond_type"]
	# temp_mol.AddBond(neighbor, anchor_idx, bond_type)
	# # 重置自由基电子数
	# a1 = temp_mol.GetAtomWithIdx(neighbor)
	# a2 = temp_mol.GetAtomWithIdx(anchor_idx)
	# a1.SetNumRadicalElectrons(0)
	# a2.SetNumRadicalElectrons(0)
	# # 验证价态
	# Chem.SanitizeMol(temp_mol)
	# new_mol = temp_mol
	# success = True
	# except Chem.rdchem.MolSanitizeException:
	# # 价态不合理，尝试反序 anchor atoms
	# temp_mol = Chem.RWMol(new_mol) # 恢复备份
	# reversed_anchors = new_anchor_indices[::-1] # 反序 anchor atoms
	# try:
	# for i, bond_info in enumerate(adjusted_bonds_info):
	# anchor_idx = reversed_anchors[i % len(reversed_anchors)]
	# neighbor = bond_info["neighbor"]
	# bond_type = bond_info["bond_type"]
	# temp_mol.AddBond(neighbor, anchor_idx, bond_type)
	# # 重置自由基电子数
	# a1 = temp_mol.GetAtomWithIdx(neighbor)
	# a2 = temp_mol.GetAtomWithIdx(anchor_idx)
	# a1.SetNumRadicalElectrons(0)
	# a2.SetNumRadicalElectrons(0)
	# # 验证价态
	# Chem.SanitizeMol(temp_mol)
	# new_mol = temp_mol
	# success = True
	# except Chem.rdchem.MolSanitizeException:
	# print(f"Failed to connect submol with {len(new_anchor_indices)} anchor atoms to {len(adjusted_bonds_info)} neighbors.")
	# raise ValueError("Unable to create valid molecule with either anchor order.")
	# if not success:
	# raise ValueError("Unable to create valid molecule.")

	# 10. 更新主分子
	mol = new_mol
	mols.append(mol)

	# 输出修改后的分子 SMILES
	modified_smiles = Chem.MolToSmiles(mols[-1])
	return mols[-1], modified_smiles


	def is_valid_chem_text(text):
	"""检查化学表达式是否只包含大小写字母、数字和成对括号，且括号成对"""
	if not text:
	return False
	if text.isdigit():
	return False
	# 检查是否只包含大小写字母、数字、括号
	if not re.match(r'^[A-Za-z0-9()]+$', text):
	return False
	# 检查括号是否成对
	stack = []
	for char in text:
	if char == '(':
	stack.append(char)
	elif char == ')':
	if not stack or stack[-1] != '(':
	return False
	stack.pop()
	return len(stack) == 0

	def select_chem_expression(orig_text, orig_score, scaled_text, scaled_score, cropped_img_orig, cropped_img_scaled):
	"""选择更合理的化学表达式"""
	# 计算分数的绝对值差
	score_diff = abs(orig_score - scaled_score)
	if scaled_text in orig_text and orig_text in ABBREVIATIONS:
	print(f'use orig_text as include the sacled and in ABBREVIATIONS {orig_text}')
	return orig_text, orig_score, cropped_img_orig
	elif orig_text in scaled_text and scaled_text in ABBREVIATIONS:
	print(f'use scaled_text as include the orig_text and in ABBREVIATIONS {scaled_text}')
	return scaled_text, scaled_score, cropped_img_scaled

	# 检查两个表达式的有效性
	orig_valid = is_valid_chem_text(orig_text)
	scaled_valid = is_valid_chem_text(scaled_text)

	#other condition here
	# 如果分差大于0.1，选择分数高的
	if score_diff > 0.1:
	if orig_valid and scaled_valid:
	if orig_score >= scaled_score and orig_text:
	return orig_text, orig_score, cropped_img_orig
	elif scaled_text:
	return scaled_text, scaled_score, cropped_img_scaled
	elif orig_valid and not scaled_valid:
	return orig_text, orig_score, cropped_img_orig
	elif scaled_valid and not orig_valid:
	return scaled_text, scaled_score, cropped_img_scaled
	else:
	print(f"Both texts are invalid: orig_text='{orig_text}', scaled_text='{scaled_text}'")
	if orig_score >= scaled_score:
	return orig_text, orig_score, cropped_img_orig
	else:
	return scaled_text, scaled_score, cropped_img_scaled
	# 如果分差小于0.1，选择更合理的化学表达式
	else:
	# 如果只有一个有效，选择有效的
	if orig_valid and not scaled_valid:
	return orig_text, orig_score, cropped_img_orig
	elif scaled_valid and not orig_valid:
	return scaled_text, scaled_score, cropped_img_scaled
	# 如果都有效，比较长度
	elif orig_valid and scaled_valid:
	if orig_text in ABBREVIATIONS and scaled_text not in ABBREVIATIONS:
	if N_C_H_expand(scaled_text) or C_F_expand(scaled_text) or C_H_expand2(scaled_text) or C_H_expand(scaled_text):
	if len(scaled_text)> len(orig_text):
	return scaled_text, scaled_score, cropped_img_scaled
	return orig_text, orig_score, cropped_img_orig
	elif orig_text not in ABBREVIATIONS and scaled_text in ABBREVIATIONS:
	if N_C_H_expand(orig_text) or C_F_expand(orig_text) or C_H_expand2(orig_text) or C_H_expand(orig_text):
	if len(orig_text)> len(scaled_text):
	return orig_text, orig_score, cropped_img_orig
	return scaled_text, scaled_score, cropped_img_scaled
	elif orig_text not in ABBREVIATIONS and scaled_text not in ABBREVIATIONS:
	if len(orig_text) > len(scaled_text):
	return orig_text, orig_score, cropped_img_orig
	else:
	if len(orig_text) == len(scaled_text):
	if orig_score >= scaled_score :
	return orig_text, orig_score, cropped_img_orig
	else:
	return scaled_text, scaled_score, cropped_img_scaled
	return scaled_text, scaled_score, cropped_img_scaled

	elif orig_text in ABBREVIATIONS and scaled_text in ABBREVIATIONS:
	if len(orig_text) >= len(scaled_text):
	return orig_text, orig_score, cropped_img_orig
	else:
	return scaled_text, scaled_score, cropped_img_scaled
	# 如果都不有效，优先选择 orig（若存在）
	elif orig_text:
	return orig_text, orig_score, cropped_img_orig
	elif scaled_text:
	return scaled_text, scaled_score, cropped_img_scaled

	# 默认返回 scaled（若存在）
	return scaled_text, scaled_score, cropped_img_scaled if scaled_text else (None, None, None)

	# def expandABB(mol,ABBREVIATIONS, placeholder_atoms):# RGROUP_SYMBOLS, ELEMENTS):
	# mols = [mol]

	# # Process placeholders in reverse order to avoid index issues
	# for idx in sorted(placeholder_atoms.keys(), reverse=True):
	# group = placeholder_atoms[idx]
	# group_smiles = _expand_abbreviation(group)# ABBREVIATIONS, RGROUP_SYMBOLS, ELEMENTS)

	# try:
	# submol = Chem.MolFromSmiles(group_smiles)
	# if not submol:
	# raise ValueError(f"Invalid SMILES for group {group}: {group_smiles}")
	# submol_rw = RWMol(submol)
	# except Exception as e:
	# print(f"Error processing SMILES for group {group}: {e}")
	# continue

	# # Create a new editable molecule
	# new_mol = RWMol(mol)
	# placeholder_idx = idx

	# # Get neighbors of the placeholder atom
	# neighbors = [nb.GetIdx() for nb in new_mol.GetAtomWithIdx(placeholder_idx).GetNeighbors()]

	# # Identify anchor atoms in submol (atoms marked as [*] or with isotope labels)
	# anchor_atoms = []
	# for atom in submol.GetAtoms():
	# if atom.GetNumRadicalElectrons() > 0:
	# #atom.GetSymbol() == '*' or atom.GetIsotope() > 0:
	# anchor_atoms.append(atom.GetIdx())

	# # Validate number of anchor atoms vs. neighbors
	# if len(anchor_atoms) != len(neighbors):
	# print(f"Warning: Mismatch between anchor atoms ({len(anchor_atoms)}) and neighbors ({len(neighbors)}) for group {group}")
	# print(len(anchor_atoms), len(neighbors))
	# if len(anchor_atoms)==0:
	# anchor_atoms.append(0)# use the first atom of submol as default such as PPh3


	# # Remove bonds involving the placeholder atom
	# bonds_to_remove = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx())
	# for bond in new_mol.GetBonds()
	# if bond.GetBeginAtomIdx() == placeholder_idx or bond.GetEndAtomIdx() == placeholder_idx]
	# for bond in bonds_to_remove:
	# new_mol.RemoveBond(bond[0], bond[1])

	# # Remove the placeholder atom
	# new_mol.RemoveAtom(placeholder_idx)

	# # Adjust neighbor indices after atom removal
	# new_neighbors = [n - 1 if n > placeholder_idx else n for n in neighbors]

	# # Combine molecules
	# new_mol = RWMol(CombineMols(new_mol, submol_rw))

	# # Connect anchor atoms to neighbors
	# submol_offset = new_mol.GetNumAtoms() - submol.GetNumAtoms()
	# for anchor_idx, neighbor_idx in zip(anchor_atoms, new_neighbors):
	# new_anchor_idx = submol_offset + anchor_idx
	# new_mol.AddBond(neighbor_idx, new_anchor_idx, Chem.BondType.SINGLE)

	# # Reset radical electrons
	# new_mol.GetAtomWithIdx(neighbor_idx).SetNumRadicalElectrons(0)
	# new_mol.GetAtomWithIdx(new_anchor_idx).SetNumRadicalElectrons(0)

	# mol = new_mol
	# mols.append(mol)

	# # Generate final SMILES
	# try:
	# modified_smiles = Chem.MolToSmiles(mols[-1])
	# except Exception as e:
	# print(f"Error generating SMILES: {e}")
	# return mols[-1], None

	# return mols[-1], modified_smiles


	# def _expand_abbreviation(abbrev):
	# """
	# Expand abbreviation into its SMILES; also converts [Rn] to [n*]
	# Used in `_condensed_formula_list_to_smiles` when encountering abbrev. in condensed formula
	# """
	# if abbrev in ABBREVIATIONS:
	# return ABBREVIATIONS[abbrev].smiles
	# elif abbrev in RGROUP_SYMBOLS or (abbrev[0] == 'R' and abbrev[1:].isdigit()):

	# if abbrev[1:].isdigit():
	# return f'[{abbrev[1:]}*]'
	# elif abbrev in ELEMENTS:#ocr tool need this
	# return f'[{abbrev}]'
	# # try abbrev

	# match = re.match(r'^(\d+)?(.*)', abbrev)
	# if match:
	# numeric_part, remaining_part = match.groups()
	# if remaining_part in ELEMENTS:
	# return f'[{abbrev}]'
	# else:
	# if numeric_part:
	# abbrev=f'[{numeric_part}*]'
	# return '[*]'



	# def expandABB(mol,ABBREVIATIONS, placeholder_atoms):
	# mols = [mol]
	# # *第三步: 替换并合并官能团**
	# # 逆序遍历 placeholder_atoms，确保删除后不会影响后续索引
	# for idx in sorted(placeholder_atoms.keys(), reverse=True):
	# group = placeholder_atoms[idx] # 获取官能团名称
	# # print(idx, group)
	# group=_expand_abbreviation(group)
	# submol = Chem.MolFromSmiles(group) # 获取官能团的子分子
	# submol_rw = RWMol(submol) # 让 submol 变成可编辑的 RWMol
	# anchor_atom_idx = 0 # 选择 `submol` 的第一个原子作为连接点 as defined in ABBREVIATIONS
	# # 1. 复制主分子
	# new_mol = RWMol(mol)
	# # *2. 计算 `` 在 `new_mol` 中的索引**
	# placeholder_idx = idx
	# # *3. 记录 `` 原子的邻居**
	# neighbors = [nb.GetIdx() for nb in new_mol.GetAtomWithIdx(placeholder_idx).GetNeighbors()]
	# # *4. 断开 `` 的所有键**
	# bonds_to_remove = [] # 记录要断开的键
	# for bond in new_mol.GetBonds():
	# if bond.GetBeginAtomIdx() == placeholder_idx or bond.GetEndAtomIdx() == placeholder_idx:
	# bonds_to_remove.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))
	# for bond in bonds_to_remove:
	# new_mol.RemoveBond(bond[0], bond[1])
	# # *5. 删除 `` 原子**
	# new_mol.RemoveAtom(placeholder_idx)
	# # 6. 重新计算 `neighbors`（删除后索引变化）
	# new_neighbors = []
	# for neighbor in neighbors:
	# if neighbor < placeholder_idx:
	# new_neighbors.append(neighbor)
	# else:
	# new_neighbors.append(neighbor - 1) # 因为删除了一个原子，所有索引 -1
	# # 7. 合并 `submol`
	# new_mol = RWMol(CombineMols(new_mol, submol_rw))

	# # 8. 计算 `submol` 的第一个原子在合并后的位置
	# new_anchor_idx = new_mol.GetNumAtoms() - len(submol_rw.GetAtoms()) + anchor_atom_idx

	# # 9. 重新连接官能团
	# for neighbor in new_neighbors:
	# # print(neighbor, new_anchor_idx, "!!")
	# new_mol.AddBond(neighbor, new_anchor_idx, Chem.BondType.SINGLE)
	# a1=new_mol.GetAtomWithIdx(neighbor)
	# a2=new_mol.GetAtomWithIdx(new_anchor_idx)
	# a1.SetNumRadicalElectrons(0)
	# a2.SetNumRadicalElectrons(0)## 将自由基电子数设为 0,as has added new bond
	# # 10. 更新主分子
	# mol = new_mol
	# mols.append(mol)
	# # 输出修改后的分子 SMILES
	# modified_smiles = Chem.MolToSmiles(mols[-1])
	# # print(f"修改后的分子 SMILES: {modified_smiles}")
	# return mols[-1], modified_smiles





	# Helper function to check if two boxes overlap
	def boxes_overlap(box1, box2):
	x1, y1, x2, y2 = box1
	bx1, by1, bx2, by2 = box2
	return not (x2 < bx1 or x1 > bx2 or y2 < by1 or y1 > by2)

	def boxes_overlap2(atombonx, bondbox):
	"""
	检查两个矩形框是否重叠，并返回 bondbox 中不重叠一端到中心 10% 位置的坐标。

	参数:
	atombonx: tuple (x1, y1, x2, y2) 表示原子框的坐标
	bondbox: tuple (bx1, by1, bx2, by2) 表示键框的坐标

	返回:
	tuple (x, y) 表示 bondbox 不重叠一端到中心 80% 位置的坐标，如果完全包含返回 (None, None)
	"""
	x1, y1, x2, y2 = atombonx
	bx1, by1, bx2, by2 = bondbox

	# 计算 bond_box 的中心坐标
	bond_center_x = (bx1 + bx2) / 2
	bond_center_y = (by1 + by2) / 2

	# 辅助函数：计算点到 atom_box 中心的距离
	def distance_to_center(x, y):
	center_x = (x1 + x2) / 2
	center_y = (y1 + y2) / 2
	return ((x - center_x) 2 + (y - center_y) 2) ** 0.5

	# 辅助函数：计算从中心到端点 80% 位置的坐标
	def get_80_percent_point(far_x, far_y):
	# 从中心到端点的向量，按 80% 比例缩放
	dx = far_x - bond_center_x
	dy = far_y - bond_center_y
	new_x = bond_center_x + 0.7 * dx#let added H close to the heavy neighbor
	new_y = bond_center_y + 0.7 * dy
	return new_x, new_y

	# 检查是否完全不相交
	if (bx2 < x1 or bx1 > x2 or by2 < y1 or by1 > y2):
	# 完全不相交，返回较远一端到中心 80% 位置
	dist1 = distance_to_center(bx1, by1)
	dist2 = distance_to_center(bx2, by2)
	far_x, far_y = (bx2, by2) if dist2 > dist1 else (bx1, by1)
	return get_80_percent_point(far_x, far_y)

	# 检查是否完全包含在 atom_box 内
	if (bx1 >= x1 and bx2 <= x2 and by1 >= y1 and by2 <= y2):
	# bondbox 完全在 atom_box 内，无法确定不重叠部分，返回 bond_center_x, bond_center_y
	# return None, None
	return bond_center_x, bond_center_y

	# 检查一端是否在 atom_box 内
	if (bx1 >= x1 and bx1 <= x2 and by1 >= y1 and by1 <= y2):
	# bx1, by1 在 atom_box 内，返回 bx2, by2 到中心 80% 位置
	return get_80_percent_point(bx2, by2)
	elif (bx2 >= x1 and bx2 <= x2 and by2 >= y1 and by2 <= y2):
	# bx2, by2 在 atom_box 内，返回 bx1, by1 到中心 80% 位置
	return get_80_percent_point(bx1, by1)

	# 处理部分相交但两端都不在 atom_box 内的情况
	# 返回较远一端到中心 80% 位置
	dist1 = distance_to_center(bx1, by1)
	dist2 = distance_to_center(bx2, by2)
	far_x, far_y = (bx2, by2) if dist2 > dist1 else (bx1, by1)
	return get_80_percent_point(far_x, far_y)


	charge_labels = [19,20,21,22,23]
	def outputbox_update(output,charge_labels,bond_labels,lab2idx):
	bonds_mask = np.array([True if ins in bond_labels else False for ins in output['pred_classes']])
	bond_bbox=output['bbox'][bonds_mask]
	atoms_mask = np.array([True if ins not in bond_labels and ins not in charge_labels else False for ins in output['pred_classes']])
	atom_bbox=output['bbox'][atoms_mask]
	new_atoms=[]
	b_len=3
	single_odd_b2a=dict()
	for bi,bb in enumerate(bond_bbox):
	overlapped_atoms = []
	overlapped_abox=[]
	for ai,aa in enumerate(atom_bbox):
	overlap_flag=boxes_overlap(bb, aa)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol
	if overlap_flag:
	# print(bb, aa,overlap_flag)
	overlapped_atoms.append(ai)
	overlapped_abox.append(aa)
	if len(overlapped_atoms) == 1:
	single_odd_b2a[bi]=overlapped_atoms
	# Compute the non-overlapping part of the bond box to place hydrogen
	non_overlapping_x,non_overlapping_y=boxes_overlap2(overlapped_abox[0], bb)
	new_atom_out={'bbox': np.array([non_overlapping_x - b_len,
	non_overlapping_y - b_len,
	non_overlapping_x + b_len,
	non_overlapping_y + b_len]).reshape(-1,4),
	'bbox_centers': np.array([non_overlapping_x,non_overlapping_y]).reshape(-1,2),
	'scores': np.array([1.0]),
	'pred_classes': np.array([lab2idx['H']])}
	new_atoms.append(new_atom_out)

	output2_=copy.deepcopy(output)
	for boxout in new_atoms:
	for k,arr in boxout.items():
	value_or_row=output2_[k]
	if arr.ndim == 1:
	output2_[k]=np.append(value_or_row, arr)
	elif arr.ndim >= 2:
	output2_[k] = np.concatenate([value_or_row, arr], axis=0)
	else:
	print('errprs, unkown conditions !!!@')
	return output2_, single_odd_b2a


	def remove_unconnected_hydrogens(mol):
	"""
	移除分子中不与重原子相连的氢原子（包括孤立 H 和只连到其他 H 的 H）。

	参数:
	mol: RDKit Mol 对象

	返回:
	移除氢原子后的 RWMol 对象
	"""
	# 转换为可编辑的 RWMol 对象
	molexp = Chem.RWMol(mol)
	to_remove = []

	# 遍历所有原子
	for atom in molexp.GetAtoms():
	if atom.GetSymbol() == 'H': # 只处理氢原子
	neighbors = atom.GetNeighbors()
	# 检查邻居中是否有重原子
	has_heavy_atom = False
	for neighbor in neighbors:
	if neighbor.GetSymbol() != 'H': # 如果邻居不是 H，则是重原子
	has_heavy_atom = True
	break
	# 如果没有重原子邻居，标记为移除
	if not has_heavy_atom:
	to_remove.append(atom.GetIdx())
	# 按索引从大到小排序，避免移除时索引混乱
	to_remove.sort(reverse=True)

	# 移除标记的原子
	for ai in to_remove:
	molexp.RemoveAtom(ai)
	return molexp

	from rdkit import Chem
	from rdkit.Chem import AllChem

	def remove_unconnected_hydrogens2(mol):
	"""
	移除分子中不与重原子相连的氢原子（包括孤立 H 和只连到其他 H 的 H），并返回移除的氢原子坐标。

	参数:
	mol: RDKit Mol 对象

	返回:
	rw_mol: 移除氢原子后的 RWMol 对象
	removed_h_coords: 移除的氢原子的坐标列表 [(x1, y1, z1), (x2, y2, z2), ...]
	"""
	# 转换为可编辑的 RWMol 对象
	rw_mol = Chem.RWMol(mol)
	to_remove = []

	# 获取分子的构象（假设只有一个构象）
	conformer = rw_mol.GetConformer()

	# 存储移除的氢原子坐标
	removed_h_coords = []

	# 遍历所有原子
	for atom in rw_mol.GetAtoms():
	if atom.GetSymbol() == 'H': # 只处理氢原子
	neighbors = atom.GetNeighbors()
	# 检查邻居中是否有重原子
	has_heavy_atom = False
	for neighbor in neighbors:
	if neighbor.GetSymbol() != 'H': # 如果邻居不是 H，则是重原子
	has_heavy_atom = True
	break
	# 如果没有重原子邻居，标记为移除，并记录坐标
	if not has_heavy_atom:
	to_remove.append(atom.GetIdx())
	pos = conformer.GetAtomPosition(atom.GetIdx())
	removed_h_coords.append((pos.x, pos.y, pos.z))
	# 按索引从大到小排序，避免移除时索引混乱
	to_remove.sort(reverse=True)
	# 移除标记的原子
	for ai in to_remove:
	rw_mol.RemoveAtom(ai)

	return rw_mol, removed_h_coords

	def detect_unconnected_hydrogens(mol):
	rw_mol = Chem.RWMol(mol)
	to_remove = []
	# 获取分子的构象（假设只有一个构象）
	conformer = rw_mol.GetConformer()
	# 存储移除的氢原子坐标
	removed_h_coords = []
	# 遍历所有原子
	for atom in rw_mol.GetAtoms():
	if atom.GetSymbol() == 'H': # 只处理氢原子
	neighbors = atom.GetNeighbors()
	# 检查邻居中是否有重原子
	has_heavy_atom = False
	for neighbor in neighbors:
	if neighbor.GetSymbol() != 'H': # 如果邻居不是 H，则是重原子
	has_heavy_atom = True
	break
	# 如果没有重原子邻居，标记为移除，并记录坐标
	if not has_heavy_atom:
	to_remove.append(atom.GetIdx())
	pos = conformer.GetAtomPosition(atom.GetIdx())
	removed_h_coords.append((pos.x, pos.y, pos.z))
	# 按索引从大到小排序，避免移除时索引混乱
	to_remove.sort(reverse=True)
	return to_remove

	def view_box_center2(bond_bbox, bond_centers, bond_scores, bond_classes,overlap_dist_thresh=5.0,
	max_centers_per_box=5,
	plot_view=False,
	):
	"""
	筛选和可视化 bond_bbox 和 bond_centers，处理重叠圆和过多中心的框。

	参数:
	bond_bbox: numpy array, [x1, y1, x2, y2] 格式的框坐标
	bond_centers: numpy array, [x, y] 格式的中心坐标
	bond_scores: numpy array, 得分
	overlap_dist_thresh: float，判断圆重叠的距离阈值（默认为 5 个单位）
	max_centers_per_box: int，一个框内允许的最大中心数（超过则移除）

	返回:
	tuple: (筛选后的 bond_bbox, bond_centers, bond_scores)
	"""
	# 确保输入形状匹配
	assert len(bond_bbox) == len(bond_centers) == len(bond_scores), "Input arrays must have equal length"
	n = len(bond_bbox)
	# Step 1: 处理重叠的 bond_centers（保留得分最高的）
	keep_centers = np.ones(n, dtype=bool) # 标记要保留的中心
	for i in range(n):
	if not keep_centers[i]:
	continue
	for j in range(i + 1, n):
	if not keep_centers[j]:
	continue
	# 计算两个中心之间的欧几里得距离
	dist = np.sqrt(np.sum((bond_centers[i] - bond_centers[j]) ** 2))
	if dist < overlap_dist_thresh:
	# 如果重叠，保留得分较高的
	if bond_scores[i] > bond_scores[j]:
	keep_centers[j] = False
	else:
	keep_centers[i] = False
	# 应用初步筛选
	bond_bbox = bond_bbox[keep_centers]
	bond_centers = bond_centers[keep_centers]
	bond_scores = bond_scores[keep_centers]
	bond_classes= bond_classes[keep_centers]
	n = len(bond_bbox) # 更新数量
	# Step 2: 检查每个框内的中心数量
	keep_boxes = np.ones(n, dtype=bool) # 标记要保留的框
	for i in range(n):
	# 计算框内的中心数量
	x1, y1, x2, y2 = bond_bbox[i]
	centers_in_box = np.sum((bond_centers[:, 0] >= x1) & (bond_centers[:, 0] <= x2) &
	(bond_centers[:, 1] >= y1) & (bond_centers[:, 1] <= y2))
	if centers_in_box > max_centers_per_box:
	keep_boxes[i] = False
	# 应用最终筛选
	final_bond_bbox = bond_bbox[keep_boxes]
	final_bond_centers = bond_centers[keep_boxes]
	final_bond_scores = bond_scores[keep_boxes]
	final_bond_classes= bond_classes[keep_boxes]
	if plot_view:
	# 可视化（可选）
	fig, ax = plt.subplots(figsize=(10, 10))
	for box in final_bond_bbox:
	x1, y1, x2, y2 = box
	width = x2 - x1
	height = y2 - y1
	rect = Rectangle((x1, y1), width, height, linewidth=1, edgecolor='blue', facecolor='none')
	ax.add_patch(rect)
	for center in final_bond_centers:
	circle = Circle(center, radius=5, edgecolor='red', facecolor='none', linewidth=1)
	ax.add_patch(circle)

	# 设置坐标轴范围
	x_min = min(final_bond_bbox[:, 0].min(), final_bond_centers[:, 0].min()) - 10
	x_max = max(final_bond_bbox[:, 2].max(), final_bond_centers[:, 0].max()) + 10
	y_min = min(final_bond_bbox[:, 1].min(), final_bond_centers[:, 1].min()) - 10
	y_max = max(final_bond_bbox[:, 3].max(), final_bond_centers[:, 1].max()) + 10
	ax.set_xlim(x_min, x_max)
	ax.set_ylim(y_min, y_max)

	ax.set_title("Filtered Boxes and Centers")
	ax.set_xlabel("X")
	ax.set_ylabel("Y")
	plt.gca().set_aspect('equal', adjustable='box')
	plt.grid(True, linestyle='--', alpha=0.7)
	# plt.show()
	else:
	fig=None
	return final_bond_bbox, final_bond_centers, final_bond_scores,final_bond_classes,fig

	def calculate_iou(box1, box2):
	"""
	计算两个框的 IoU（Intersection over Union）。

	参数:
	box1, box2: [x1, y1, x2, y2] 格式的框坐标

	返回:
	float: IoU 值
	"""
	x1 = max(box1[0], box2[0])
	y1 = max(box1[1], box2[1])
	x2 = min(box1[2], box2[2])
	y2 = min(box1[3], box2[3])

	intersection = max(0, x2 - x1) * max(0, y2 - y1)
	area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
	area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
	union = area1 + area2 - intersection

	return intersection / union if union > 0 else 0

	def nms_per_class(labels, boxes, scores, iou_thresh=0.5):
	"""
	对每个类别应用 NMS，保留得分最高的框。
	参数:
	labels: numpy array，类别标签
	boxes: numpy array，框坐标 [x1, y1, x2, y2]
	scores: numpy array，得分
	iou_thresh: float，IoU 阈值
	返回:
	dict: 筛选后的输出
	"""
	# 按类别分组
	unique_labels = np.unique(labels)
	kept_indices = []
	for label in unique_labels:
	# 筛选当前类别的框
	class_mask = labels == label
	class_indices = np.where(class_mask)[0]
	class_boxes = boxes[class_mask]
	class_scores = scores[class_mask]

	# 按得分从高到低排序
	order = np.argsort(class_scores)[::-1]
	class_boxes = class_boxes[order]
	class_scores = class_scores[order]
	class_indices = class_indices[order]

	# NMS
	keep = []
	while len(class_scores) > 0:
	# 保留得分最高的框
	keep.append(class_indices[0])
	if len(class_scores) == 1:
	break

	# 计算当前框与其他框的 IoU
	ious = np.array([calculate_iou(class_boxes[0], box) for box in class_boxes[1:]])
	# 保留 IoU 低于阈值的框
	keep_mask = ious < iou_thresh
	class_boxes = class_boxes[1:][keep_mask]
	class_scores = class_scores[1:][keep_mask]
	class_indices = class_indices[1:][keep_mask]

	kept_indices.extend(keep)

	# 根据保留的索引更新输出
	kept_indices = np.array(kept_indices)
	return {
	'labels': labels[kept_indices],
	'boxes': boxes[kept_indices],
	'scores': scores[kept_indices]
	}




	import numpy as np
	def get_overlap_region(box1, box2):
	"""
	Get the overlapping region of two boxes.

	Args:
	box1, box2: [x_min, y_min, x_max, y_max]

	Returns:
	tuple: (x_min, y_min, x_max, y_max) of overlap region, or None if no overlap
	"""
	x1 = max(box1[0], box2[0])
	y1 = max(box1[1], box2[1])
	x2 = min(box1[2], box2[2])
	y2 = min(box1[3], box2[3])

	if x2 <= x1 or y2 <= y1:
	return None # No overlap
	return (x1, y1, x2, y2)

	def are_bond_connected(box1, box2, bond_bboxes, bond_iou_threshold=0.1):
	"""
	Check if two atom boxes are connected by a bond box, with bond center in overlap region.

	Args:
	box1, box2: atom boxes to check
	bond_bboxes: array of bond boxes
	bond_iou_threshold: IoU threshold for initial bond overlap

	Returns:
	bool: True if connected by a bond with center in overlap region
	"""
	# Get the overlap region of the two atom boxes
	overlap_region = get_overlap_region(box1, box2)
	if overlap_region is None:
	return False # No overlap between atom boxes

	ox_min, oy_min, ox_max, oy_max = overlap_region

	for bond_box in bond_bboxes:
	# Preliminary IoU check
	iou1 = calculate_iou(box1, bond_box)
	iou2 = calculate_iou(box2, bond_box)
	if iou1 > bond_iou_threshold and iou2 > bond_iou_threshold:
	# Calculate bond box center
	bond_center_x = (bond_box[0] + bond_box[2]) / 2
	bond_center_y = (bond_box[1] + bond_box[3]) / 2

	# Check if bond center is within the overlap region
	if (ox_min <= bond_center_x <= ox_max and
	oy_min <= bond_center_y <= oy_max):
	return True
	return False

	def calculate_iou(box1, box2):
	"""
	计算两个边界框的 IoU
	box1, box2: [x_min, y_min, x_max, y_max]
	"""
	x1 = max(box1[0], box2[0])
	y1 = max(box1[1], box2[1])
	x2 = min(box1[2], box2[2])
	y2 = min(box1[3], box2[3])

	intersection = max(0, x2 - x1) * max(0, y2 - y1)
	area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
	area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
	union = area1 + area2 - intersection

	return intersection / union if union > 0 else 0

	def nms(atom_bboxes, atom_scores, atom_classes, iou_threshold=0.5):
	"""
	应用非极大值抑制 (NMS)
	atom_bboxes: 列表，包含所有边界框 [x_min, y_min, x_max, y_max]
	atom_scores: 列表，包含每个边界框的置信度
	atom_classes: 列表，包含每个边界框的类别
	iou_threshold: IoU 阈值，用于判断是否抑制
	返回: 保留的边界框、类别和置信度的索引
	"""
	# 按置信度排序，获取索引
	indices = np.argsort(atom_scores)[::-1] # 从高到低排序

	keep_indices = []
	while len(indices) > 0: # 使用 len(indices) 替代 indices.size
	# 保留当前最高置信度的框
	current_idx = indices[0]
	keep_indices.append(current_idx)

	# 计算当前框与其他框的 IoU
	ious = np.array([calculate_iou(atom_bboxes[current_idx], atom_bboxes[idx]) for idx in indices[1:]])
	# 找出 IoU > threshold 的索引（相对于 indices[1:] 的偏移）
	suppress_indices = indices[1:][ious > iou_threshold]
	# 更新 indices，去除当前框和被抑制的框
	indices = np.setdiff1d(indices, np.concatenate(([current_idx], suppress_indices)))
	# 调试信息
	# print(f"Current idx: {current_idx}, rmoved: {suppress_indices}, Remaining: {indices}")
	# print(f"Current idx: {current_idx}, rmoved: {suppress_indices}, IOU: {ious}")

	# 返回保留的框、类别和置信度
	kept_bboxes = np.array([atom_bboxes[i] for i in keep_indices])
	kept_classes = np.array([atom_classes[i] for i in keep_indices])
	kept_scores = np.array([atom_scores[i] for i in keep_indices])

	return kept_bboxes, kept_classes, kept_scores

	def count_bond_overlaps(box, bond_bboxes, bond_iou_threshold=0.1):
	"""
	Count how many bond boxes overlap with an atom box.

	Args:
	box: atom box [x_min, y_min, x_max, y_max]
	bond_bboxes: array of bond boxes
	bond_iou_threshold: IoU threshold for overlap

	Returns:
	int: number of overlapping bond boxes
	"""
	return sum(1 for bond_box in bond_bboxes if calculate_iou(box, bond_box) > bond_iou_threshold)


	def count_bond_overlaps(box, bond_bboxes, bond_iou_threshold=0.01):
	"""Count how many bond boxes overlap with an atom box."""
	return sum(1 for bond_box in bond_bboxes if calculate_iou(box, bond_box) > bond_iou_threshold)

	def count_atom_overlaps(box, all_bboxes, exclude_idx, min_iou=0.01):
	"""Count how many other atom boxes overlap with this box."""
	return sum(1 for i, other_box in enumerate(all_bboxes)
	if i != exclude_idx and calculate_iou(box, other_box) > min_iou)

	def merge_low_iou_boxes(kept_bboxes, kept_classes, kept_scores, bond_bboxes,
	merge_threshold=0.5, score_threshold=0.7, bond_iou_threshold=0.01,
	high_iou_threshold=0.8, large_score_threshold=0.5):
	"""
	Merge or filter boxes with IoU conditions, removing large low-score boxes first.

	Args:
	kept_bboxes: array, atom bounding boxes [x_min, y_min, x_max, y_max]
	kept_classes: array, class labels (e.g., 0 for 'C')
	kept_scores: array, confidence scores
	bond_bboxes: array, bond bounding boxes
	merge_threshold: float, upper IoU threshold for merging
	score_threshold: float, score threshold to preserve boxes
	bond_iou_threshold: float, IoU threshold for bond connectivity
	high_iou_threshold: float, IoU threshold for high-IoU merging
	large_score_threshold: float, score threshold for large box removal (default 0.5)

	Returns:
	tuple: (merged_bboxes, merged_classes, merged_scores)
	"""
	if len(kept_bboxes) <= 1:
	return kept_bboxes, kept_classes, kept_scores

	kept_bboxes = np.array(kept_bboxes)
	kept_classes = np.array(kept_classes)
	kept_scores = np.array(kept_scores)
	bond_bboxes = np.array(bond_bboxes)

	# Step 0: Remove large boxes with low scores, high atom overlaps, and high bond overlaps
	areas = (kept_bboxes[:, 2] - kept_bboxes[:, 0]) * (kept_bboxes[:, 3] - kept_bboxes[:, 1])
	median_area = np.median(areas)
	keep_mask = np.ones(len(kept_bboxes), dtype=bool)

	for i in range(len(kept_bboxes)):
	if kept_scores[i] < large_score_threshold:
	atom_overlaps = count_atom_overlaps(kept_bboxes[i], kept_bboxes, i)
	bond_overlaps = count_bond_overlaps(kept_bboxes[i], bond_bboxes, bond_iou_threshold)
	is_large = areas[i] > median_area # Define "large" as above median
	if is_large and atom_overlaps >= 2 and bond_overlaps >= 3:
	keep_mask[i] = False
	print(f"Removed large low-score box idx {i}: score {kept_scores[i]}, "
	f"area {areas[i]}, atom overlaps {atom_overlaps}, bond overlaps {bond_overlaps}")

	# Filter boxes
	kept_bboxes = kept_bboxes[keep_mask]
	print(f"afterRemoved large low-score atom box::{len(kept_bboxes)} ")
	kept_classes = kept_classes[keep_mask]
	kept_scores = kept_scores[keep_mask]
	if len(kept_bboxes) == 0:
	return np.array([]), np.array([]), np.array([])

	merged_bboxes = []
	merged_classes = []
	merged_scores = []
	used_indices = set()

	# Step 1: Merge boxes with IoU > high_iou_threshold
	i = 0
	while i < len(kept_bboxes):
	if i in used_indices:
	i += 1
	continue

	high_iou_group = [i]
	for j in range(len(kept_bboxes)):
	if j in used_indices or j == i:
	continue
	iou = calculate_iou(kept_bboxes[i], kept_bboxes[j])
	if iou > high_iou_threshold:
	high_iou_group.append(j)

	if len(high_iou_group) > 1:#atom box ovrlaped
	group_scores = kept_scores[high_iou_group]
	max_score_idx = high_iou_group[np.argmax(group_scores)]
	merged_bboxes.append(kept_bboxes[max_score_idx])
	merged_classes.append(kept_classes[max_score_idx])
	merged_scores.append(kept_scores[max_score_idx])
	used_indices.update(high_iou_group)
	print(f"Merged high-IoU (> {high_iou_threshold}) boxes: {high_iou_group}, "
	f"kept index: {max_score_idx}")
	i += 1

	# Step 2: Process remaining boxes
	i = 0
	while i < len(kept_bboxes):
	if i in used_indices:
	i += 1
	continue

	current_indices = [i]
	for j in range(len(kept_bboxes)):
	if j in used_indices or j == i:
	continue
	iou = calculate_iou(kept_bboxes[i], kept_bboxes[j])#IOU between atoms box
	if 0.05 <= iou < merge_threshold:#better detect model with score matters
	#any small IOU between atoms will processed here
	if kept_scores[j]<0.7:
	current_indices.append(j)

	group_indices = current_indices
	group_scores = kept_scores[group_indices]
	group_classes = kept_classes[group_indices]
	group_bboxes = kept_bboxes[group_indices]

	max_score = np.max(group_scores)
	max_score_idx = group_indices[np.argmax(group_scores)]

	if max_score >= score_threshold:
	bond_connected = False
	if len(group_indices) > 1:
	for idx1, idx2 in zip(group_indices[:-1], group_indices[1:]):
	if are_bond_connected(kept_bboxes[idx1], kept_bboxes[idx2],
	bond_bboxes, bond_iou_threshold):
	bond_connected = True
	break
	if bond_connected:
	for idx in group_indices:
	merged_bboxes.append(kept_bboxes[idx])
	merged_classes.append(kept_classes[idx])
	merged_scores.append(kept_scores[idx])
	print(f"Kept all bond-connected boxes: {group_indices}")
	else:
	bond_overlap_counts = [count_bond_overlaps(kept_bboxes[idx], bond_bboxes,
	bond_iou_threshold) for idx in group_indices]
	max_overlaps = max(bond_overlap_counts)
	candidates = [idx for idx, count in zip(group_indices, bond_overlap_counts)
	if count == max_overlaps]
	best_idx = max(candidates, key=lambda idx: kept_scores[idx])
	merged_bboxes.append(kept_bboxes[best_idx])
	merged_classes.append(kept_classes[best_idx])
	merged_scores.append(kept_scores[best_idx])
	# print(f"No bond box overlap, kept box with most bond overlaps: {best_idx}, "
	# f"overlap count: {max_overlaps}")
	else:
	if len(group_indices) == 1:
	merged_bboxes.append(kept_bboxes[i])
	merged_classes.append(kept_classes[i])
	merged_scores.append(kept_scores[i])
	print(f"Merged lower IOU @@ ONLY ONE box {i}")
	else:
	new_bbox = [
	np.min(group_bboxes[:, 0]), # x_min
	np.min(group_bboxes[:, 1]), # y_min
	np.max(group_bboxes[:, 2]), # x_max
	np.max(group_bboxes[:, 3]) # y_max
	]
	merged_bboxes.append(new_bbox)
	merged_classes.append(group_classes[np.argmax(group_scores)])
	merged_scores.append(max_score)
	print(f"Merged low-score boxes: {group_indices}")
	used_indices.update(group_indices)
	i += 1

	print(f"after processs low IOU atom box::{len(merged_bboxes)} ")
	return (np.array(merged_bboxes), np.array(merged_classes), np.array(merged_scores))


	def refine_boxes(atom_bboxes, atom_scores, atom_classes, bond_bboxes,
	nms_iou_threshold=0.5, merge_threshold=0.5, score_threshold=0.5,
	bond_iou_threshold=0.01, high_iou_threshold=0.8):
	"""
	Iteratively apply NMS and merge until the number of boxes stabilizes.

	Args:
	atom_bboxes, atom_scores, atom_classes: Initial atom box data
	bond_bboxes: Bond box data
	nms_iou_threshold, merge_threshold, score_threshold, bond_iou_threshold, high_iou_threshold: Parameters

	Returns:
	tuple: (final_bboxes, final_classes, final_scores)
	"""
	current_bboxes = np.array(atom_bboxes)
	current_classes = np.array(atom_classes)
	current_scores = np.array(atom_scores)
	prev_count = len(current_bboxes) + 1 # Ensure at least one iteration

	iteration = 0
	while len(current_bboxes) < prev_count:
	print(f"\nIteration {iteration}: Starting with {len(current_bboxes)} boxes")
	prev_count = len(current_bboxes)

	# Apply NMS
	kept_bboxes, kept_classes, kept_scores = nms(
	current_bboxes, current_scores, current_classes, iou_threshold=nms_iou_threshold
	)
	print(f"After NMS: {len(kept_bboxes)} boxes")

	# Apply merge_low_iou_boxes
	merged_bboxes, merged_classes, merged_scores = merge_low_iou_boxes(
	kept_bboxes, kept_classes, kept_scores, bond_bboxes,
	merge_threshold=merge_threshold, score_threshold=score_threshold,
	bond_iou_threshold=bond_iou_threshold, high_iou_threshold=high_iou_threshold
	)
	print(f"After merge: {len(merged_bboxes)} boxes")

	# Update for next iteration
	current_bboxes = merged_bboxes
	current_classes = merged_classes
	current_scores = merged_scores
	iteration += 1

	print(f"Converged after {iteration} iterations with {len(current_bboxes)} boxes")
	return current_bboxes, current_scores, current_classes

	def merge_low_iou_boxes_old(kept_bboxes, kept_classes, kept_scores, merge_threshold=0.3):
	"""
	合并 IoU < merge_threshold 的边界框，使用较高 score 的 class
	"""
	if len(kept_bboxes) <= 1:
	return kept_bboxes, kept_classes, kept_scores

	merged_bboxes = []
	merged_classes = []
	merged_scores = []
	used_indices = set()

	for i in range(len(kept_bboxes)):
	if i in used_indices:
	continue

	# 找到 IoU < merge_threshold 的框组
	current_indices = [i]
	for j in range(i + 1, len(kept_bboxes)):
	if j in used_indices:
	continue
	iou = calculate_iou(kept_bboxes[i], kept_bboxes[j])
	if iou < merge_threshold and iou >0.01:
	current_indices.append(j)

	# 获取相关框的 score, class, 和 bbox
	scores = kept_scores[current_indices]
	classes = kept_classes[current_indices]
	bboxes = kept_bboxes[current_indices]

	max_score = np.max(scores)
	max_score_idx = current_indices[np.argmax(scores)]

	if max_score > 0.5:
	# 保留 score 最大的框
	merged_bboxes.append(kept_bboxes[max_score_idx])
	merged_classes.append(kept_classes[max_score_idx])
	merged_scores.append(kept_scores[max_score_idx])
	else:
	# 合并框，取最小和最大坐标
	new_bbox = [
	np.min(bboxes[:, 0]), # x_min
	np.min(bboxes[:, 1]), # y_min
	np.max(bboxes[:, 2]), # x_max
	np.max(bboxes[:, 3]) # y_max
	]
	merged_bboxes.append(new_bbox)
	merged_classes.append(0)#repalce with *
	merged_scores.append(max_score)

	# 标记已使用的索引
	used_indices.update(current_indices)

	# 转换为 NumPy 数组
	merged_bboxes = np.array(merged_bboxes)
	merged_classes = np.array(merged_classes)
	merged_scores = np.array(merged_scores)

	return merged_bboxes, merged_classes, merged_scores

	############################################################################################################################################################
	#molscrbe evaluate
	from SmilesPE.pretokenizer import atomwise_tokenizer

	def canonicalize_smiles(smiles, ignore_chiral=False, ignore_cistrans=False, replace_rgroup=True):
	if type(smiles) is not str or smiles == '':
	return '', False
	if ignore_cistrans:
	smiles = smiles.replace('/', '').replace('\\', '')
	if replace_rgroup:
	tokens = atomwise_tokenizer(smiles)
	for j, token in enumerate(tokens):
	if token[0] == '[' and token[-1] == ']':
	symbol = token[1:-1]
	if symbol[0] == 'R' and symbol[1:].isdigit():
	tokens[j] = f'[{symbol[1:]}*]'
	elif Chem.AtomFromSmiles(token) is None:
	tokens[j] = '*'
	smiles = ''.join(tokens)
	try:
	canon_smiles = Chem.CanonSmiles(smiles, useChiral=(not ignore_chiral))
	success = True
	except:
	canon_smiles = smiles
	success = False
	return canon_smiles, success

	def convert_smiles_to_canonsmiles(
	smiles_list, ignore_chiral=False, ignore_cistrans=False, replace_rgroup=True, num_workers=16):
	with multiprocessing.Pool(num_workers) as p:
	results = p.starmap(canonicalize_smiles,
	[(smiles, ignore_chiral, ignore_cistrans, replace_rgroup) for smiles in smiles_list],
	chunksize=128)
	canon_smiles, success = zip(*results)
	return list(canon_smiles), np.mean(success)

	def tanimoto_similarity(smiles1, smiles2):
	try:
	mol1 = Chem.MolFromSmiles(smiles1)
	mol2 = Chem.MolFromSmiles(smiles2)
	fp1 = Chem.RDKFingerprint(mol1)
	fp2 = Chem.RDKFingerprint(mol2)
	tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
	return tanimoto
	except:
	return 0


	def compute_tanimoto_similarities(gold_smiles, pred_smiles, num_workers=16):
	with multiprocessing.Pool(num_workers) as p:
	similarities = p.starmap(tanimoto_similarity, [(gs, ps) for gs, ps in zip(gold_smiles, pred_smiles)])
	return similarities

	class SmilesEvaluator(object):
	def __init__(self, gold_smiles, num_workers=16, tanimoto=False):
	self.gold_smiles = gold_smiles
	self.num_workers = num_workers
	self.tanimoto = tanimoto
	self.gold_smiles_cistrans, _ = convert_smiles_to_canonsmiles(gold_smiles,
	ignore_cistrans=True,
	num_workers=num_workers)
	self.gold_smiles_chiral, _ = convert_smiles_to_canonsmiles(gold_smiles,
	ignore_chiral=True, ignore_cistrans=True,
	num_workers=num_workers)
	self.gold_smiles_cistrans = self._replace_empty(self.gold_smiles_cistrans)
	self.gold_smiles_chiral = self._replace_empty(self.gold_smiles_chiral)

	def _replace_empty(self, smiles_list):
	"""Replace empty SMILES in the gold, otherwise it will be considered correct if both pred and gold is empty."""
	return [smiles if smiles is not None and type(smiles) is str and smiles != "" else "<empty>"
	for smiles in smiles_list]

	def evaluate(self, pred_smiles, include_details=False):
	results = {}
	if self.tanimoto:
	results['tanimoto'] = np.mean(compute_tanimoto_similarities(self.gold_smiles, pred_smiles))
	# Ignore double bond cis/trans
	pred_smiles_cistrans, _ = convert_smiles_to_canonsmiles(pred_smiles,
	ignore_cistrans=True,
	num_workers=self.num_workers)
	results['canon_smiles'] = np.mean(np.array(self.gold_smiles_cistrans) == np.array(pred_smiles_cistrans))
	if include_details:
	results['canon_smiles_details'] = (np.array(self.gold_smiles_cistrans) == np.array(pred_smiles_cistrans))
	# Ignore chirality (Graph exact match)
	pred_smiles_chiral, _ = convert_smiles_to_canonsmiles(pred_smiles,
	ignore_chiral=True, ignore_cistrans=True,
	num_workers=self.num_workers)
	results['graph'] = np.mean(np.array(self.gold_smiles_chiral) == np.array(pred_smiles_chiral))
	# Evaluate on molecules with chiral centers
	chiral = np.array([[g, p] for g, p in zip(self.gold_smiles_cistrans, pred_smiles_cistrans) if '@' in g])
	results['chiral'] = np.mean(chiral[:, 0] == chiral[:, 1]) if len(chiral) > 0 else -1
	return results



	############################################################################################################################################################
	def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
	data_loader: Iterable, optimizer: torch.optim.Optimizer,
	device: torch.device, epoch: int, max_norm: float = 0, **kwargs):
	model.train()
	criterion.train()
	metric_logger = MetricLogger(delimiter=" ")
	metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}'))
	# metric_logger.add_meter('class_error', SmoothedValue(window_size=1, fmt='{value:.2f}'))
	header = 'Epoch: [{}]'.format(epoch)
	print_freq = kwargs.get('print_freq', 10)

	ema = kwargs.get('ema', None)
	scaler = kwargs.get('scaler', None)

	for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
	samples = samples.to(device)
	targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

	if scaler is not None:
	with torch.autocast(device_type=str(device), cache_enabled=True):
	outputs = model(samples, targets)

	with torch.autocast(device_type=str(device), enabled=False):
	loss_dict = criterion(outputs, targets)

	loss = sum(loss_dict.values())
	scaler.scale(loss).backward()

	if max_norm > 0:
	scaler.unscale_(optimizer)
	torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

	scaler.step(optimizer)
	scaler.update()
	optimizer.zero_grad()

	else:
	outputs = model(samples, targets)
	loss_dict = criterion(outputs, targets)

	loss = sum(loss_dict.values())
	optimizer.zero_grad()
	loss.backward()

	if max_norm > 0:
	torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)

	optimizer.step()

	# ema
	if ema is not None:
	ema.update(model)

	loss_dict_reduced = reduce_dict(loss_dict)
	loss_value = sum(loss_dict_reduced.values())

	if not math.isfinite(loss_value):
	print("Loss is {}, stopping training".format(loss_value))
	print(loss_dict_reduced)
	sys.exit(1)

	metric_logger.update(loss=loss_value, **loss_dict_reduced)
	metric_logger.update(lr=optimizer.param_groups[0]["lr"])

	# gather the stats from all processes
	metric_logger.synchronize_between_processes()
	print("Averaged stats:", metric_logger)
	return {k: meter.global_avg for k, meter in metric_logger.meters.items()}



	# @torch.no_grad()
	# def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, data_loader, base_ds, device, output_dir,
	# annot_file=f'/home/jovyan/rt-detr/data/real_processed/CLEF_with_charge/annotations/val.json',
	# outcsv_filename=f'/home/jovyan/rt-detr/rt-detr/output/output_charge_CLEF.csv',
	# ):
	# model.eval()
	# criterion.eval()

	# metric_logger = MetricLogger(delimiter=" ")
	# header = 'Test:'

	# iou_types = postprocessors.iou_types
	# coco_evaluator = CocoEvaluator(base_ds, iou_types)

	# panoptic_evaluator = None

	# # # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# # home='/home/jovyan/rt-detr'
	# # dataset = 'CLEF'
	# # annot_file=f'/home/jovyan/rt-detr/data/real_processed/{dataset}_with_charge/annotations/test.json'
	# # outcsv_filename/home/jovyan/rt-detr/rt-detr/output/output_charge_{dataset}.csv'


	# # annot_file=f'/home/jovyan/rt-detr/data/real_processed/{dataset}_with_charge/annotations/test.json'
	# # outcsv_filename=f'/home/jovyan/rt-detr/rt-detr/output/output_charge_{dataset}.csv'
	# with open(annot_file, 'r') as file:
	# data = json.load(file)




	# image_id_to_name = {}

	# for image_data in data['images']:
	# image_id = image_data['id']
	# image_path = image_data['file_name']
	# image_name = os.path.basename(image_path)
	# image_id_to_name[image_id] = image_name

	# res_smiles = []
	# bond_labels = [13,14,15,16,17,18]
	# idx_to_labels={0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B',
	# 9:'I',10:'P',11:'H',12:'Si',
	# #bond
	# 13:'single',14:'wdge',15:'dash',
	# 16:'=',17:'#',18:':',#aromatic
	# #charge
	# 19:'-4',20:'-2',
	# 21:'-1',#-
	# 22:'+1',#+
	# 23:'2',
	# }
	# lab2idx={v:k for k,v in idx_to_labels.items()}
	# #indigo bond type stero maping
	# indi_bond={
	# "1":'single', "2":'=',"3":'#',"4":':',"5":'wdge',"6":'dash',
	# }


	# smiles_data = pd.DataFrame({'file_name': [],
	# 'SMILES':[]})

	# output_dict = {}
	# target_dict = {}
	# filtered_output_dict = {}
	# # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# for samples, targets in metric_logger.log_every(data_loader, 10, header):
	# samples = samples.to(device)
	# targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

	# outputs = model(samples)

	# orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
	# results = postprocessors(outputs, orig_target_sizes)#RTDETRPostProcessor@@src/zoo/rtertr
	# #results: a list of dict label box score
	# res = {target['image_id'].item(): output for target, output in zip(targets, results)}

	# for target, output in zip(targets, results):
	# output_dict[target['image_id'].item()] = output

	# stats = {}
	# # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
	# if coco_evaluator is not None:
	# if 'bbox' in iou_types:
	# # stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()
	# stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats
	# if 'segm' in iou_types:
	# stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist()



	# # ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
	# # ocr_recognition_only = get_ocr_recognition_only(force_cpu=False)
	# # caption_remover = CaptionRemover(force_cpu=True)
	# for key, value in output_dict.items():#TODO improve here
	# selected_indices = value['scores'] > 0.5#may be >=0.5 cut off, as used the sigmoid?
	# if value['labels'][selected_indices].size(0) != 0:#no good prediction
	# filtered_output_dict[key] = {
	# 'labels': value['labels'][selected_indices],# may be selected_indices ==0 as all small than0.5
	# 'boxes': value['boxes'][selected_indices],
	# 'scores': value['scores'][selected_indices]
	# }
	# else:
	# ima_name=image_id_to_name[key]
	# print(key,"all prediction scores small 0.5!!",len(output_dict),f"{ima_name}")##

	# for i,(key,value) in enumerate(filtered_output_dict.items()):
	# result = []#TODO need a box2mol or graph
	# smi_mol=output_to_smiles(value,idx_to_labels,bond_labels,result)#TODO use the idx_to_labels numer to if --else
	# if smi_mol:
	# res_smiles.append(smi_mol[0]) #TODO check this erro other0
	# else:
	# res_smiles.append('')

	# new_row = {'file_name':image_id_to_name[key], 'SMILES':res_smiles[i]}
	# smiles_data = smiles_data._append(new_row, ignore_index=True)

	# print(f"will save {len(smiles_data)} dataframe into csv")
	# smiles_data.to_csv(outcsv_filename, index=False)

	# return stats, coco_evaluator

	def remove_bond_directions_if_no_chiral(mol):
	# 检查分子是否有效
	if mol is None:
	return None
	# 计算手性中心
	chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
	# 如果没有手性中心，移除单键的立体化学标记
	if not chiral_centers:
	for bond in mol.GetBonds():
	# 只处理单键
	if bond.GetBondType() == Chem.BondType.SINGLE:
	# 移除楔形和虚线标记
	bond.SetBondDir(Chem.BondDir.NONE)
	return mol
	#######################################################################################
	def molExpanding(mol_rebuit,placeholder_atoms,wdbs,bond_dirs,alignmol=False):
	cm=copy.deepcopy(mol_rebuit)
	# print(placeholder_atoms)
	expand_mol, expand_smiles= expandABB(cm,ABBREVIATIONS, placeholder_atoms)
	rdm=copy.deepcopy(expand_mol)
	AllChem.Compute2DCoords(rdm)
	target_mol, ref_mol=rdm, cm

	if alignmol:
	mcs=rdFMCS.FindMCS([target_mol, ref_mol], # larger,small order
	atomCompare=rdFMCS.AtomCompare.CompareAny,
	# bondCompare=rdFMCS.BondCompare.CompareAny,
	ringCompare=rdFMCS.RingCompare.IgnoreRingFusion,
	matchChiralTag=False,
	)
	atommaping_pairs=g_atompair_matches([target_mol, ref_mol],mcs)
	atomMap=atommaping_pairs[0]
	try:
	rmsd2=rdkit.Chem.rdMolAlign.AlignMol(prbMol=target_mol, refMol=ref_mol, atomMap=atomMap,maxIters=2000000)
	except Exception as e:
	print(atomMap,"@@@@")
	print(e)
	#after get atomMap
	c2p={cur:pre for cur, pre in atomMap}
	p2c={pre:cur for cur, pre in atomMap}
	for b in wdbs:#add bond direction
	p0,p1=int(b[0]), int(b[1])#may be not in the atomMap as the mcs_sub
	if p0 in p2c.keys() and p1 in p2c.keys():
	c0,c1=p2c[p0],p2c[p1]
	# print("[pre0,pre1]vs[c0,c1]current atom id",[p0,p1],[c0,c1])
	b_=target_mol.GetBondBetweenAtoms(c0,c1)
	if b_:
	b_.SetBondDir(bond_dirs[b[3]])
	expandStero_smi=Chem.MolToSmiles(target_mol)#directly will not add the stero info into smiles, must have the assing steps
	else:
	expandStero_smi =expand_smiles

	m=target_mol.GetMol()
	# Chem.SanitizeMol(m)
	Chem.DetectBondStereochemistry(m)
	Chem.AssignChiralTypesFromBondDirs(m)
	Chem.AssignStereochemistry(m)#expandStero_smi , m

	return expandStero_smi, m


	def remove_backslash_and_slash(input_string):
	if "\\" in input_string:
	input_string = input_string.replace("\\", "")
	if "/" in input_string:
	input_string = input_string.replace("/", "")

	return input_string


	def remove_number_before_star(input_string):
	result = list(input_string)

	i = 0
	while i < len(result):
	if result[i] == '*' and i!= len(result) -1:
	#c1c()c()c(C()()C(C)C)c()c1* --> c1c()c()c(C()()C(C)C)c()c1*
	j = i - 1
	if result[j-1].isalpha():
	continue
	while j >= 0 and result[j].isdigit():
	result[j] = ''
	j -= 1
	i += 1

	return ''.join(result)

	def remove_SP(input_string):
	pattern = r'\[([^@])@?[A-Z0-9]\]'
	# if "S@SP1" in input_string:
	# input_string = input_string.replace("S@SP1", "S")
	# elif "S@SP2" in input_string:
	# input_string = input_string.replace("S@SP2", "S")
	# elif "S@SP3" in input_string:
	# input_string = input_string.replace("S@SP3", "S")
	input_string = re.sub(r'@SP[1-3]', '', input_string)
	if '@TB' in input_string:
	result = re.sub(pattern, r'[\1]', input_string)
	input_string=result
	return input_string

	def rdkit_canonicalize_smiles(smiles):
	Aad_string = r'([A-Z][a-z]*)([0-9]+)'
	tokens = atomwise_tokenizer(smiles)
	for j, token in enumerate(tokens):
	if token[0] == '[' and token[-1] == ']':
	symbol = token[1:-1]
	# matches = re.findall(Aad_string, symbol)#findall may give not wanted, such as [BH2], shuld not change
	matches = re.match(Aad_string, symbol)
	if matches:
	letters, numbers = matches.groups()
	print(f"{letters} {numbers}")
	# tokens[j] = f'[{symbol[1:]}*]'
	tokens[j] = '*'
	elif symbol in RGROUP_SYMBOLS:# or (symbol[0] in RGROUP_SYMBOLS and abbrev[1:].isdigit()):
	tokens[j] = '*'
	elif Chem.AtomFromSmiles(token) is None:
	tokens[j] = '*'

	smiles = ''.join(tokens)
	try:
	canon_smiles = Chem.CanonSmiles(smiles, useChiral=False)
	success = True
	except:
	canon_smiles = smiles
	success = False
	return canon_smiles, success

	def NoRadical_Smi(smi):
	aa=Chem.MolFromSmiles(smi)
	for atom in aa.GetAtoms():
	if atom.GetNumRadicalElectrons() > 0: # 检查是否有自由基
	# print(f"找到自由基原子: {atom.GetSymbol()}, 自由电子数: {atom.GetNumRadicalElectrons()}")
	# 添加氢原子以去除自由基
	atom.SetNumRadicalElectrons(0) # 将自由电子数设为 0
	# 根据硫原子的化合价调整氢原子数
	atom.SetNumExplicitHs(atom.GetTotalValence() - atom.GetExplicitValence())
	san_before=Chem.MolToSmiles(aa)
	# print(san_before)
	return san_before

	import logging

	def check_and_fix_valence(smiles_or_list):
	"""
	Check atom valences in a SMILES string or a list [smiles, suffix/prefix].
	Fix unusual valences (e.g., N(2)) by adding/removing hydrogens to maintain neutrality.
	Returns: (corrected_smiles_or_list, warnings)
	"""
	# Set up logging
	logging.basicConfig(level=logging.WARNING)
	warnings = []

	# Standard valence dictionary for common atoms
	standard_valences = {
	'C': [4],
	'N': [3], # Prioritize valence 3 for neutral nitrogen (e.g., amines, amides)
	'O': [2],
	'H': [1],
	'F': [1]
	}

	# Handle input: SMILES string or list from C_H_expand
	if isinstance(smiles_or_list, list):
	smiles, other_part = smiles_or_list
	else:
	smiles, other_part = smiles_or_list, None

	# Process main SMILES
	mol = Chem.MolFromSmiles(smiles, sanitize=False) if smiles else None
	if mol is None:
	warnings.append(f"Invalid SMILES: {smiles}")
	return smiles_or_list, warnings

	# Process other_part if it exists and is a valid SMILES
	other_part_mol = None
	if other_part:
	try:
	other_part_mol = Chem.MolFromSmiles(other_part, sanitize=False)
	except:
	pass # other_part may not be valid SMILES (e.g., a suffix/prefix)

	# Helper function to check and fix valence for a molecule
	def process_molecule(mol, is_other_part=False):
	nonlocal warnings
	corrected = False
	prefix = "other_part" if is_other_part else "SMILES"

	# Compute valence explicitly to avoid precondition violation
	mol.UpdatePropertyCache(strict=False)

	# Check valences
	for atom in mol.GetAtoms():
	symbol = atom.GetSymbol()
	valence = atom.GetTotalValence()
	expected_valences = standard_valences.get(symbol, [valence])
	if valence not in expected_valences:
	warnings.append(f"Unusual valence in {prefix} for {symbol}: {valence} (expected {expected_valences})")

	# Fix nitrogen valence issues by adjusting hydrogens
	if any('N' in w for w in warnings if prefix in w):
	rw_mol = Chem.RWMol(mol) # Editable molecule
	for atom in rw_mol.GetAtoms():
	if atom.GetSymbol() != 'N':
	continue
	valence = atom.GetTotalValence()
	if valence < 3:
	# Add hydrogens to reach valence 3
	hydrogens_needed = 3 - valence
	atom.SetNumExplicitHs(atom.GetNumExplicitHs() + hydrogens_needed)
	corrected = True
	elif valence > 3:
	# Remove hydrogens if possible
	hydrogens_to_remove = valence - 3
	current_hydrogens = atom.GetNumExplicitHs()
	if current_hydrogens >= hydrogens_to_remove:
	atom.SetNumExplicitHs(current_hydrogens - hydrogens_to_remove)
	corrected = True
	else:
	warnings.append(f"Cannot reduce N valence in {prefix} to 3 without removing non-H bonds")
	if corrected:
	mol = rw_mol.GetMol()

	# Sanitize molecule after corrections
	if corrected:
	try:
	Chem.SanitizeMol(mol, catchErrors=True)
	return mol, True
	except Exception as e:
	warnings.append(f"Failed to sanitize {prefix} after correction: {str(e)}")
	return mol, False
	return mol, False

	# Process main molecule
	mol, mol_corrected = process_molecule(mol)

	# Convert main molecule back to SMILES
	corrected_smiles = Chem.MolToSmiles(mol) if mol_corrected else smiles

	# Process other_part if it's a valid molecule
	corrected_other_part = other_part
	if other_part_mol:
	other_part_mol, other_corrected = process_molecule(other_part_mol, is_other_part=True)
	corrected_other_part = Chem.MolToSmiles(other_part_mol) if other_corrected else other_part

	# Return based on input type
	if other_part:
	return [corrected_smiles, corrected_other_part], warnings
	return corrected_smiles, warnings

	def molfpsim(original_smiles,test_smiles):#I2M use the coordinates, so 2D coformation should be always
	#only use longest for desalts, one molecule comparing
	test_smiles= select_longest_smiles(test_smiles)
	original_smiles= select_longest_smiles(original_smiles)
	test_smiles, warnings=check_and_fix_valence(test_smiles)

	original_smiles = remove_backslash_and_slash(original_smiles)#c/s
	test_smiles = remove_backslash_and_slash(test_smiles)
	original_smiles = re.sub(r'\[(\d+)\', '[',original_smiles)#[1]-->[]
	test_smiles = re.sub(r'\[(\d+)\', '[',test_smiles)
	original_smiles = remove_SP(original_smiles)#additional complex space stero from coordinates, most not used
	test_smiles = remove_SP(test_smiles)

	rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)#R-->*
	if "S" in rd_smi_ori and success1:#NOTE H replace radical electron
	rd_smi_ori=NoRadical_Smi(rd_smi_ori)
	rd_smi, success2=rdkit_canonicalize_smiles(test_smiles)
	original_smiles,test_smiles=rd_smi_ori,rd_smi

	mol1 = Chem.MolFromSmiles(original_smiles)#TODO considering smiles with rdkit not recongized in real data
	mol2 = Chem.MolFromSmiles(test_smiles)#TODO considering smiles with rdkit not recongized in real data

	morganfps1 = AllChem.GetMorganFingerprint(mol1, useChirality=False)
	morganfps2 = AllChem.GetMorganFingerprint(mol2, useChirality=False)
	morgan_tani = DataStructs.DiceSimilarity(morganfps1, morganfps2)
	fp1 = Chem.RDKFingerprint(mol1)
	fp2 = Chem.RDKFingerprint(mol2)
	tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
	return morgan_tani, tanimoto




	def comparing_smiles2(original_smiles,test_smiles):#I2M use the coordinates, so 2D coformation should be always
	original_smiles = remove_backslash_and_slash(original_smiles)#c/s
	test_smiles = remove_backslash_and_slash(test_smiles)
	original_smiles = re.sub(r'\[(\d+)\', '[',original_smiles)#[1]-->[]
	test_smiles = re.sub(r'\[(\d+)\', '[',test_smiles)
	original_smiles = remove_SP(original_smiles)#additional complex space stero from coordinates, most not used
	test_smiles = remove_SP(test_smiles)

	rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)#R-->*
	if "S" in rd_smi_ori and success1:#NOTE H replace radical electron
	rd_smi_ori=NoRadical_Smi(rd_smi_ori)

	rd_smi, success2=rdkit_canonicalize_smiles(test_smiles)
	original_smiles,test_smiles=rd_smi_ori,rd_smi

	try:
	original_mol = Chem.MolFromSmiles(original_smiles)#considering whe nmmet abbrev
	test_mol = Chem.MolFromSmiles(test_smiles,sanitize=False)#as build mol may not sanitized for rdkit
	if success2 and success1:
	# if original_smiles!=test_smiles:
	# print(f'smiles ori,pred after Chem.CanonSmiles:\n{original_smiles}\n{test_smiles}')
	RDarom_smi=Chem.MolToSmiles(original_mol)
	RDarom_smi_test=Chem.MolToSmiles(test_mol)
	if RDarom_smi==RDarom_smi_test:
	return True
	else:
	print(f'smiles ori,pred after Chem.CanonSmiles:\n{RDarom_smi}\n{RDarom_smi_test}\n')

	if original_mol:
	Chem.SanitizeMol(original_mol)
	keku_smi_ori=Chem.MolToSmiles(original_mol,kekuleSmiles=True)
	else:
	keku_smi_ori=original_smiles

	if test_mol:
	Chem.SanitizeMol(test_mol)
	keku_smi=Chem.MolToSmiles(test_mol,kekuleSmiles=True)
	else:
	keku_smi=test_smiles

	if '*' not in keku_smi:
	keku_inch_ori= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi_ori))
	keku_inch_test= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi))
	else:
	keku_inch_ori= 1
	keku_inch_test= 2

	rd_smi=Chem.MolToSmiles(test_mol)#need improve the acc
	rd_smi_ori=Chem.MolToSmiles(original_mol)
	except Exception as e:#TODO fixme here
	print(f"comparing_smiles@@@ kekulize or SanitizeMol problems")# original_smiles,test_smiles\n{original_smiles}\n{test_smiles}")
	print(e,"!!!!!!!\n")
	keku_inch_ori= 1
	keku_inch_test= 2
	keku_smi=1
	keku_smi_ori=2
	#add molscribe rules here
	if not success1:#ori smiles still invaild even after * replaced
	rd_smi_ori = rd_smi
	# else:
	# if canon_smiles1 == canon_smiles2:
	# rd_smi_ori = rd_smi
	# else:
	if rd_smi_ori == rd_smi or keku_smi_ori == keku_smi or keku_inch_ori==keku_inch_test :#as orinial smiles may use kekuleSmiles style
	return True
	else:return False

	def smiles12_comparing(original_smiles,test_smiles):
	original_smiles = remove_backslash_and_slash(original_smiles)#c/s
	test_smiles = remove_backslash_and_slash(test_smiles)
	original_smiles = re.sub(r'\[(\d+)\', '[',original_smiles)#[1]-->[]
	test_smiles = re.sub(r'\[(\d+)\', '[',test_smiles)
	original_smiles = remove_SP(original_smiles)#additional complex space stero from coordinates, most not used
	test_smiles = remove_SP(test_smiles)

	rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)
	rd_smi, success2=rdkit_canonicalize_smiles(test_smiles)
	original_smiles,test_smiles=rd_smi_ori,rd_smi
	try:
	original_mol = Chem.MolFromSmiles(original_smiles)#considering whe nmmet abbrev
	test_mol = Chem.MolFromSmiles(test_smiles,sanitize=False)#as build mol may not sanitized for rdkit
	if original_mol:
	Chem.SanitizeMol(original_mol)
	keku_smi_ori=Chem.MolToSmiles(original_mol,kekuleSmiles=True)
	else:
	keku_smi_ori=original_smiles

	if test_mol:
	Chem.SanitizeMol(test_mol)
	keku_smi=Chem.MolToSmiles(test_mol,kekuleSmiles=True)
	else:
	keku_smi=test_smiles

	if '*' not in keku_smi:
	keku_inch_ori= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi_ori))
	keku_inch_test= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi))
	else:
	keku_inch_ori= 1
	keku_inch_test= 2

	rd_smi=Chem.MolToSmiles(test_mol)#need improve the acc
	rd_smi_ori=Chem.MolToSmiles(original_mol)
	except Exception as e:#TODO fixme here
	print(f"comparing_smiles@@@ kekulize or SanitizeMol problems")# original_smiles,test_smiles\n{original_smiles}\n{test_smiles}")
	print(e,"!!!!!!!\n")
	keku_inch_ori= 1
	keku_inch_test= 2
	keku_smi=1
	keku_smi_ori=2
	#add molscribe rules here
	if not success1:#ori smiles still invaild even after * replaced
	rd_smi_ori = rd_smi
	# else:
	# if canon_smiles1 == canon_smiles2:
	# rd_smi_ori = rd_smi
	# else:
	if rd_smi_ori == rd_smi or keku_smi_ori == keku_smi or keku_inch_ori==keku_inch_test :#as orinial smiles may use kekuleSmiles style
	return True
	else:return False


	def comparing_smiles(new_row,test_smiles):#I2M use the coordinates, so 2D coformation should be always
	original_smiles=new_row['SMILESori']
	original_smiles = remove_backslash_and_slash(original_smiles)#c/s
	test_smiles = remove_backslash_and_slash(test_smiles)
	original_smiles = re.sub(r'\[(\d+)\', '[',original_smiles)#[1]-->[]
	test_smiles = re.sub(r'\[(\d+)\', '[',test_smiles)
	original_smiles = remove_SP(original_smiles)#additional complex space stero from coordinates, most not used
	test_smiles = remove_SP(test_smiles)

	rd_smi_ori, success1=rdkit_canonicalize_smiles(original_smiles)
	rd_smi, success2=rdkit_canonicalize_smiles(test_smiles)
	original_smiles,test_smiles=rd_smi_ori,rd_smi
	try:
	original_mol = Chem.MolFromSmiles(original_smiles)#considering whe nmmet abbrev
	test_mol = Chem.MolFromSmiles(test_smiles,sanitize=False)#as build mol may not sanitized for rdkit
	if original_mol:
	Chem.SanitizeMol(original_mol)
	keku_smi_ori=Chem.MolToSmiles(original_mol,kekuleSmiles=True)
	else:
	keku_smi_ori=original_smiles

	if test_mol:
	Chem.SanitizeMol(test_mol)
	keku_smi=Chem.MolToSmiles(test_mol,kekuleSmiles=True)
	else:
	keku_smi=test_smiles

	if '*' not in keku_smi:
	keku_inch_ori= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi_ori))
	keku_inch_test= Chem.MolToInchi(Chem.MolFromSmiles(keku_smi))
	else:
	keku_inch_ori= 1
	keku_inch_test= 2

	rd_smi=Chem.MolToSmiles(test_mol)#need improve the acc
	rd_smi_ori=Chem.MolToSmiles(original_mol)
	except Exception as e:#TODO fixme here
	print(f"comparing_smiles@@@ kekulize or SanitizeMol problems")# original_smiles,test_smiles\n{original_smiles}\n{test_smiles}")
	print(new_row)
	print(e,"!!!!!!!\n")
	keku_inch_ori= 1
	keku_inch_test= 2
	keku_smi=1
	keku_smi_ori=2
	#add molscribe rules here
	if not success1:#ori smiles still invaild even after * replaced
	rd_smi_ori = rd_smi
	# else:
	# if canon_smiles1 == canon_smiles2:
	# rd_smi_ori = rd_smi
	# else:
	if rd_smi_ori == rd_smi or keku_smi_ori == keku_smi or keku_inch_ori==keku_inch_test :#as orinial smiles may use kekuleSmiles style
	return True
	else:return False







	def bbox2center(bbox):
	x_center = (bbox[:, 0] + bbox[:, 2]) / 2
	y_center = (bbox[:, 1] + bbox[:, 3]) / 2
	# center_coords = torch.stack((x_center, y_center), dim=1)
	centers = np.stack((x_center, y_center), axis=1)
	return centers

	import cv2
	BONDDIRECT=['ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']


	def reorder_bond_bbox(bond_bbox, single_atom_bond):
	# 分离普通索引和需要后置的索引
	normal_indices = []
	special_indices = []
	# 获取需要后置的 key
	keys_to_move = set(single_atom_bond.keys())
	# 分类所有索引
	for i in range(len(bond_bbox)):
	if i in keys_to_move:
	special_indices.append(i)
	else:
	normal_indices.append(i)
	# 新顺序：普通索引在前，特殊索引在后
	new_order = normal_indices + special_indices
	# 重排 bond_bbox
	reordered_bbox = [bond_bbox[i] for i in new_order]
	return reordered_bbox

	def boxes_overlap(box1, box2):
	"""
	检查两个边界框是否重叠
	box1, box2: [x1, y1, x2, y2]
	"""
	return not (box1[2] < box2[0] or box1[0] > box2[2] or
	box1[3] < box2[1] or box1[1] > box2[3])
	def calculate_center(box):
	"""
	计算边界框的中心点
	"""
	return np.array([(box[0] + box[2]) / 2, (box[1] + box[3]) / 2])
	def merge_boxes(box1, box2):
	"""
	合并两个边界框，返回新边界框 [x1, y1, x2, y2]
	"""
	return [
	min(box1[0], box2[0]),
	min(box1[1], box2[1]),
	max(box1[2], box2[2]),
	max(box1[3], box2[3])
	]


	def get_merged_box(boxes):
	"""Calculate the smallest box encompassing all given boxes."""
	x_mins = [box[0] for box in boxes]
	y_mins = [box[1] for box in boxes]
	x_maxs = [box[2] for box in boxes]
	y_maxs = [box[3] for box in boxes]
	return [min(x_mins), min(y_mins), max(x_maxs), max(y_maxs)]

	def box_area(box):
	"""Calculate the area of a box."""
	return (box[2] - box[0]) * (box[3] - box[1])

	def Newbox_(atom_bbox,bond_bbox, lab2idx):
	#add H atom box when on direction bond
	new_atoms=[]
	b_len=3
	single_odd_b2a=dict()
	for bi,bb in enumerate(bond_bbox):
	overlapped_atoms = []
	overlapped_abox=[]
	for ai,aa in enumerate(atom_bbox):
	overlap_flag=boxes_overlap(bb, aa)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol
	if overlap_flag:
	# print(bb, aa,overlap_flag)
	overlapped_atoms.append(ai)
	overlapped_abox.append(aa)
	if len(overlapped_atoms) == 1:
	single_odd_b2a[bi]=overlapped_atoms
	# Compute the non-overlapping part of the bond box to place hydrogen
	non_overlapping_x,non_overlapping_y=boxes_overlap2(overlapped_abox[0], bb)
	new_atom_out={'bbox': np.array([non_overlapping_x - b_len,
	non_overlapping_y - b_len,
	non_overlapping_x + b_len,
	non_overlapping_y + b_len]).reshape(-1,4),
	'bbox_centers': np.array([non_overlapping_x,non_overlapping_y]).reshape(-1,2),
	'scores': np.array([1.0]),
	'pred_classes': np.array([lab2idx['H']])}
	new_atoms.append(new_atom_out)
	return new_atoms, single_odd_b2a


	def has_boxes(data):
	#TO CHECK OCR detct used or not
	return isinstance(data, list) and len(data) > 0 and all(
	isinstance(item, list) and len(item) == 2 and
	isinstance(item[0], list) and len(item[0]) == 4
	for item in data
	)

	def AtomBox2bondBox(atom_box,bond_bbox):
	b_nei=[]
	overlap=True
	for bi,bb in enumerate(bond_bbox):
	overlap_flag=boxes_overlap(bb, atom_box)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol
	if overlap_flag:
	b_nei.append(bi)
	if len(b_nei)==0:
	# delt_hei.append(hei)
	overlap=False
	return overlap, b_nei


	import torchvision.transforms.v2 as T

	def image_to_tensor(image_path,debug=True):
	image = Image.open(image_path)
	w, h = image.size

	# 处理灰度或其他模式
	if image.mode == "L":
	if debug: print("检测到灰度图像 (1 通道)，转换为 RGB...")
	image = image.convert("RGB")
	elif image.mode != "RGB":
	if debug: print(f"检测到 {image.mode} 模式，转换为 RGB...")
	image = image.convert("RGB")
	# Define a transform to convert the image to a tensor and normalize it
	transform = T.Compose([
	T.Resize((640, 640)), # 调整大小
	# T.ToImageTensor(), # 转换为 PyTorch Tensor
	T.ToTensor(),
	lambda x: x.to(torch.float32), # 手动转换数据类型# T.ConvertDtype(dtype=torch.float32), # 转换数据类型
	])

	# Apply the transform to the image
	tensor = transform(image)

	return tensor,w,h



	# from src.zoo.rtdetr.rtdetr_postprocessor import RTDETRPostProcessor

	@torch.no_grad()
	def evaluate_x(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors,
	data_loader, device,
	outcsv_filename=f'/home/jovyan/rt-detr/rt-detr/output/output_charge_CLEF.csv',
	visual_check=False,
	other2ppsocr=True,
	getacc=False,
	):

	postprocessor2=RTDETRPostProcessor(num_classes=30, use_focal_loss=True, num_top_queries=300, remap_mscoco_category=False)
	output_directory = os.path.dirname(outcsv_filename)
	prefix_f = os.path.basename(outcsv_filename).split('.')[0]
	if other2ppsocr:
	ocr = PaddleOCR(
	use_angle_cls=True,
	lang='latin',use_space_char=True,use_debug=False,
	use_gpu=True if cv2.cuda.getCudaEnabledDeviceCount() > 0 else False)

	ocr2 = ocr2 = PaddleOCR(use_angle_cls=True,use_gpu =False,use_debug=False,
	rec_algorithm='SVTR_LCNet', rec_model_dir='/nfs_home/bowen/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer',
	lang="en")
	outcsv_filename=f"{output_directory}/{prefix_f}_withOCR.csv"


	if visual_check:
	output_directory = os.path.dirname(outcsv_filename)
	prefix_f = os.path.basename(outcsv_filename).split('.')[0]
	ima_checkdir=f"{output_directory}/{prefix_f}_Boxed"
	os.makedirs(ima_checkdir, exist_ok=True)

	if getacc:
	acc_summary=f"{outcsv_filename}.I2Msummary.txt"
	flogout = open(f'{acc_summary}' , 'w')
	failed=[]
	mydiff=[]
	simRD=0
	sim=0
	mysum=0

	model.eval()
	criterion.eval()
	metric_logger = MetricLogger(delimiter=" ")
	header = 'Infering:'
	res_smiles = []
	idx_to_labels23={0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B',
	9:'I',10:'P',11:'*',12:'Si',13:'NONE',14:'BEGINWEDGE',15:'BEGINDASH',
	16:'=',17:'#',18:'-4',19:'-2',20:'-1',21:'1',22:'2',}
	idx_to_labels30 = {0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B',
	9:'I',10:'P',11:'H',12:'Si',13:'NONE',14:'BEGINWEDGE',15:'BEGINDASH',
	16:'=',17:'#',18:'-4',19:'-2',20:'-1',21:'1',22:'2',
	23:'CF3',#NOTE rdkit get element not supporting group
	24:'CN',
	25:'Me',
	26:'CO2Et',
	27:'R',
	28:'Ph',
	29:'*',
	}
	bond_labels = [13,14,15,16,17]

	if postprocessors.num_classes==23:
	# print(data["categories"])
	print(f'usage idx_to_labels23',idx_to_labels23)
	idx_to_labels = idx_to_labels23
	elif postprocessors.num_classes==30:
	# print(data["categories"])#NOTE 11 is H not * now
	print(f'usage idx_to_labels30',idx_to_labels30)
	idx_to_labels = idx_to_labels30
	else:
	print(f"error unkown ways@@@@@@@@@@@!!!!!!!!!!idx_to_labels::{len(idx_to_labels)}\n{idx_to_labels}")
	abrevie={"[23*]":'CF3',
	"[24*]":'CN',
	"[25*]":'Me',
	"[26*]":'CO2Et',
	"[27*]":'R',
	"[28*]":'Ph',
	"[29*]":'3~7UP',
	}
	# idx_to_labels = idx_to_labels23
	lab2idx={ v:k for k,v in idx_to_labels.items() }

	smiles_data = pd.DataFrame({'file_name': [],
	'SMILESori':[],
	'SMILESpre':[],
	'SMILESexp':[],
	}
	)
	output_dict = {}
	output_ori={}
	filtered_output_dict = {}
	box_thresh=0.1
	# for samples, targets in metric_logger.log_every(data_loader, 10, header):
	# samples = samples.to(device)
	# # targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
	# outputs = model(samples)
	# # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)#.to(device)
	# orig_target_sizes = targets["orig_size"].to(device)
	# results = postprocessors(outputs, orig_target_sizes)#RTDETRPostProcessor@@src/zoo/rtertr
	# for i_, z in enumerate(zip(targets['image_id'], results)):
	# ti, output=z
	# output_dict[ti.item()] = [
	# output,
	# targets['img_path'][i_],
	# targets['SMILES'][i_],
	# ]

	# output_ori[ti.item()] =[
	# targets['img_path'][i_],
	# targets['SMILES'][i_],
	# ]
	# print(len(output_ori),len(output_dict))
	for samples, targets in metric_logger.log_every(data_loader, 10, header):
	# orig_target_sizes = targets["orig_size"].to(device)
	for i_, ti in enumerate(targets['image_id']):
	output_dict[ti.item()] = [
	targets['img_path'][i_],
	targets['SMILES'][i_],
	]


	for key, value in output_dict.items():

	image_path = value[0]
	SMILESori = value[1]

	# selected_indices = value['scores'] > 0.5#may be >=0.5 cut off, as used the sigmoid?
	# selected_indices = value[0]['scores'] > box_thresh
	# true_count = selected_indices.sum().item()
	#testing here
	image_path='/cadd_data/samba_share/from_docker/data/work_space/ori/real/acs/ol020229e-Scheme-c3-10.png'

	tensor,w,h = image_to_tensor(image_path)
	tensor=tensor.unsqueeze(0).to(device)
	print(tensor.size()) # Output tensor shape (C x H x W)
	ori_size=torch.Tensor([w,h]).long().unsqueeze(0).to(device)
	outputs = model(tensor)
	result_ = postprocessor2(outputs, ori_size)
	# result_ = postprocessors(outputs, ori_size)
	score_=result_[0]['scores']
	boxe_=result_[0]['boxes']
	label_=result_[0]['labels']
	#---------------------------------################################
	selected_indices =score_ > box_thresh
	true_count = selected_indices.sum().item()
	output={
	'labels': label_[selected_indices].to("cpu").numpy(),
	'boxes': boxe_[selected_indices].to("cpu").numpy(),
	'scores': score_[selected_indices].to("cpu").numpy()
	}

	img_ori = Image.open(image_path).convert('RGB')
	w_ori, h_ori = img_ori.size # 获取原始图像的尺寸
	print(w_ori, h_ori, "orignianl vs 1000,1000")

	print(f"selected_indices 中 True 的数量: {true_count}")
	print(f"before nms_per_class, :: box 的数量:{len(output['labels'])}")
	output = nms_per_class(output['labels'], output['boxes'], output['scores'], iou_thresh=0.5)
	print(f"after nms_per_class, :: box 的数量:{len(output['labels'])}")


	# filtered_output_dict={image_path: output}
	x_center = (output["boxes"][:, 0] + output["boxes"][:, 2]) / 2
	y_center = (output["boxes"][:, 1] + output["boxes"][:, 3]) / 2
	# center_coords = torch.stack((x_center, y_center), dim=1)
	center_coords = np.stack((x_center, y_center), axis=1)
	# center_coords=np.stack((x_center, y_center)).reshape(-1,2)#NOTE not do this, mix element order shits
	#TODO split atom_charge \ bond drawing
	output = {'bbox': output["boxes"],#.to("cpu").numpy(),
	'bbox_centers': center_coords,#.to("cpu").numpy(),
	'scores': output["scores"],#.to("cpu").numpy(),
	'pred_classes': output["labels"],#.to("cpu").numpy()
	}
	############################################################################################################################
	img_ori = Image.open(image_path).convert('RGB')
	w_ori, h_ori = img_ori.size # 获取原始图像的尺寸
	print(w_ori, h_ori, "orignianl vs 1000,1000")
	# 计算缩放比例
	scale_x = 1000 / w_ori
	scale_y = 1000 / h_ori
	img_ori_1k = img_ori.resize((1000,1000))
	img = Image.open(image_path).convert('RGB')
	img = img.resize((1000,1000))
	# atom_bondBox_check=True

	print(f"from oupt socore > {box_thresh},get box {len(output['bbox'])} after nms_per_class ")
	# split into atom bond charge nms， then mergd , then box2 mol NOTE charege and bond confidence at least >10%
	charge_mask = np.array([True if ins in charge_labels and output['scores'][i]>0.1 else False for i, ins in enumerate(output['pred_classes'])])
	charges_bbox=output['bbox'][charge_mask]
	charges_centers= output['bbox_centers'][charge_mask]
	charges_classes= output['pred_classes'][charge_mask]
	charges_scores= output['scores'][charge_mask]
	charges_bbox, charges_centers, charges_scores,charges_classes,figc =view_box_center2(charges_bbox, charges_centers, charges_scores,charges_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)
	#view_box_center2 help remove large box if boxscore small than 0.5
	# bonds_mask2 = np.array([True if ins in bond_labels else False for ins in output['pred_classes']])
	# bonds_mask= output['scores'][bonds_mask2]>=0.1# TODO fix me, as training bond box overlap with bondbox,aussme bond socre make sense
	bonds_mask = np.array([True if ins in bond_labels and output['scores'][i]>0.2 else False for i, ins in enumerate(output['pred_classes'])])
	bond_bbox=output['bbox'][bonds_mask]
	bond_centers= output['bbox_centers'][bonds_mask]
	bond_classes= output['pred_classes'][bonds_mask]
	bond_scores= output['scores'][bonds_mask]
	# bond_bbox2, bond_centers2, bond_scores2,bond_classes2,fig=view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)
	bond_bbox, bond_centers, bond_scores,bond_classes,fig =view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=3)
	bond_bbox, bond_classes, bond_scores = nms(bond_bbox, bond_scores,bond_classes, iou_threshold=0.5)

	heavy_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins != lab2idx['H'] else False for ins in output['pred_classes']])
	h_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins == lab2idx['H'] else False for ins in output['pred_classes']])

	#TODO fix me if heavy or H all need this view_box_center2 filtering
	heavy_bbox = output['bbox'][heavy_mask]
	heavy_classes = output['pred_classes'][heavy_mask]
	heavy_centers= output['bbox_centers'][heavy_mask]
	heavy_scores= output['scores'][heavy_mask]
	heavy_bbox, heavy_centers, heavy_scores,heavy_classes,fighv =view_box_center2(heavy_bbox, heavy_centers, heavy_scores,heavy_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)

	#TODO del isolated C without bond box overlap
	delt_hei=[]
	for hei,hebox in enumerate(heavy_bbox):
	he_class=idx_to_labels[heavy_classes[hei]]
	b_nei=[]
	if he_class in ['C']:#TODO add other cases
	for bi,bb in enumerate(bond_bbox):
	overlap_flag=boxes_overlap(bb, hebox)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol
	if overlap_flag:
	b_nei.append(bi)
	if len(b_nei)==0:
	delt_hei.append(hei)
	n = len(heavy_scores) # 更新数量
	keep_boxes = np.ones(n, dtype=bool)
	keep_boxes[delt_hei]=False
	heavy_bbox, heavy_centers, heavy_scores,heavy_classes=heavy_bbox[keep_boxes], heavy_centers[keep_boxes], heavy_scores[keep_boxes],heavy_classes[keep_boxes]

	h_bbox = output['bbox'][h_mask]
	h_centers= output['bbox_centers'][h_mask]
	h_classes= output['pred_classes'][h_mask]
	h_scores= output['scores'][h_mask]
	h_bbox, h_centers, h_scores,h_classes,figh =view_box_center2(h_bbox, h_centers, h_scores,h_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)

	#NOTE need keep the order heavy atom first then following with Hs
	# atoms_mask = np.array([True if ins not in bond_labels and ins not in charge_labels else False for ins in output['pred_classes']])
	# atom_bbox=output['bbox'][atoms_mask]
	# atom_classes=output['pred_classes'][atoms_mask]
	# 合并 bbox，保持重原子在前，氢原子在后
	atom_bbox = np.concatenate([heavy_bbox, h_bbox], axis=0)
	atom_classes = np.concatenate([heavy_classes, h_classes], axis=0)
	# atom_centers = np.concatenate([heavy_centers, h_centers], axis=0)
	atom_scores = np.concatenate([heavy_scores, h_scores], axis=0)
	#TODO nms checking
	# kept_bboxes, kept_classes, kept_scores=nms(atom_bbox, atom_scores, atom_classes, iou_threshold=0.5)
	# # kept_bboxes, kept_classes, kept_scores=nms_atomBox(atom_bbox, atom_scores, atom_classes, iou_threshold=0.5)
	# merged_bboxes, merged_classes, merged_scores = merge_low_iou_boxes(kept_bboxes, kept_classes, kept_scores, merge_threshold=0.5, score_threshold=0.7)
	# print(f'ater nms kept_box {len(kept_bboxes)}, followd merge_low_iou_boxes kept_box:: {len(merged_bboxes)}')
	# atom_bbox, atom_classes, atom_scores=merged_bboxes, merged_classes, merged_scores
	atom_bbox, atom_scores, atom_classes = refine_boxes(atom_bbox, atom_scores, atom_classes, bond_bbox)


	x_center = (atom_bbox[:, 0] + atom_bbox[:, 2]) / 2
	y_center = (atom_bbox[:, 1] + atom_bbox[:, 3]) / 2
	# center_coords = torch.stack((x_center, y_center), dim=1)
	center_coords = np.stack((x_center, y_center), axis=1)
	atom_centers=center_coords

	print(f"before NMS :: heavy box {len(heavy_bbox)} ---- H box {len(h_bbox)}---bond box{len(bond_bbox)}")
	print(f"after NMS+view_box_center2 :: atom box {len(atom_bbox)} bond box {len(bond_bbox)} charge box {len(charges_bbox)} ")
	# print(f"bond box with only single atom box overlap:: {single_odd_bi}")
	print(f"atom box afte NMS and merge_low_iou_boxes")
	print(f"get box {len(output['bbox'])} with NMS")
	print(f"atom score >0.1 bond score >0.2, then folllowed with NMS")
	print(f"bond_bbox nums::",bond_bbox.shape,len(bond_bbox))
	print(f" OCR will start involved ")#
	#check if ODD single-bonds with only one atom exisits, try add the atoms box for this bond
	new_atoms, single_odd_b2a= Newbox_(atom_bbox,bond_bbox, lab2idx )
	print(f"new_atoms number {len(new_atoms)}\n{new_atoms}")
	if len(new_atoms)>0:
	for boxout in new_atoms:
	for k,arr in boxout.items():
	value_or_row=output[k]
	if arr.ndim == 1:
	output[k]=np.append(value_or_row, arr)
	elif arr.ndim >= 2:
	output[k] = np.concatenate([value_or_row, arr], axis=0)
	else:
	print('errprs, unkown conditions !!!@')
	#NOTE try to use OCR to help postprocess box adding and del
	# 加载图像 OCR
	image = cv2.imread(image_path)
	image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	# 预处理图像突出下标
	gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
	_, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
	# print(_, thresh)
	kernel = np.ones((2, 2), np.uint8)
	dilated = cv2.dilate(thresh, kernel, iterations=1)
	# cv2.imwrite("preprocessed.jpg", dilated)#NOTE comment if need checking
	# result = ocr.ocr("preprocessed.jpg", cls=True)
	# ocr.ocr(image_npocr, cls=True, det=False)
	result = ocr.ocr(dilated, cls=True) # 直接传递 NumPy 数组
	# 解析结果
	text_boxes = []
	text_contents = []
	confidences = []
	for line in result:
	print(line)
	if line:
	for box_info in line:
	box = box_info[0]
	x_coords = [point[0] for point in box]
	y_coords = [point[1] for point in box]
	text_box = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
	text = box_info[1][0]
	text_boxes.append(text_box)
	text_contents.append(text)
	confidences.append(box_info[1][1])
	print("Detected text boxes:", text_boxes)
	print("Detected text contents:", text_contents)
	print("Confidences:", confidences)
	#after whole img OCRed
	# Initialize dictionaries and lists
	ai2text = {}
	ai2relplace = {}
	ai2rdkitlab_unknown = {}
	non_overlapping_texts = []
	# Build initial KDTree
	tree = cKDTree(atom_centers)
	# Collect indices to delete after the loop to keep tree valid during processing
	indices_to_delete = set()
	# Process each OCR text box
	for ti, text_box in enumerate(text_boxes):
	text_center = calculate_center(text_box)
	ocr_text = text_contents[ti]

	# Normalize OCR text
	if ocr_text in ['OH', 'HO']:
	ocr_text = 'O'
	elif ocr_text in ['SH', 'HS']:
	ocr_text = 'S'
	elif ocr_text in ['NH', 'HN']:
	ocr_text = 'N'
	elif ocr_text in ['CH', 'HC']:
	ocr_text = 'C'
	elif ocr_text == '0':
	ocr_text = 'O'
	elif ocr_text == 'L':
	ocr_text = 'Li'
	elif ocr_text[-1]=='-':
	if ocr_text[:-1] in ABBREVIATIONS:
	ocr_text=ocr_text[:-1]

	# Find all overlapping atom boxes
	overlapping_indices = []
	for idx in range(len(atom_bbox)):
	if idx not in indices_to_delete and boxes_overlap(atom_bbox[idx], text_box):
	overlapping_indices.append(idx)

	if overlapping_indices:
	# If there are overlapping atom boxes, merge them
	if len(overlapping_indices) > 1:
	# Get the smallest box encompassing all overlapping atom boxes
	overlapping_boxes = [atom_bbox[idx] for idx in overlapping_indices]
	merged_box = get_merged_box(overlapping_boxes)
	overlapping_indices_atomboxclass=[idx_to_labels[atom_classes[i]] for i in overlapping_indices]
	print(f"Merging {len(overlapping_indices)} atom boxes overlapping with OCR text: {ocr_text}")
	print(f" {overlapping_indices} boxes type{overlapping_indices_atomboxclass} merged as OCR text: {ocr_text}")
	merged_area = box_area(merged_box)
	text_area = box_area(text_box)
	final_box = merged_box if merged_area >= text_area else text_box
	else:
	# If only one overlap, use the text box directly
	final_box = text_box
	# Use the OCR text box as the merged box
	primary_idx = overlapping_indices[0]
	# atom_bbox[primary_idx] = text_box

	# Update the primary atom box
	atom_bbox[primary_idx] = final_box
	# Update class and dictionaries based on OCR text
	if ocr_text in ABBREVIATIONS:
	ai2relplace[primary_idx] = ocr_text
	atom_classes[primary_idx] = 0
	if ocr_text in lab2idx:
	atom_classes[primary_idx] = lab2idx[ocr_text]
	elif ocr_text in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']:
	atom_classes[primary_idx] = lab2idx[ocr_text]
	elif ocr_text in RGROUP_SYMBOLS or (ocr_text[0] == 'R' and ocr_text[1:].isdigit()):
	atom_classes[primary_idx] = 0
	else:
	ai2rdkitlab_unknown[primary_idx] = ocr_text
	atom_classes[primary_idx] = 0

	ai2text[primary_idx] = ocr_text

	# Mark redundant indices for deletion
	indices_to_delete.update(overlapping_indices[1:])

	else:
	# No overlap: record the text box and nearest atom index
	distance, nearest_idx = tree.query(text_center)
	if nearest_idx not in indices_to_delete: # Only record if nearest_idx is still valid
	print(f"No overlap for OCR text '{ocr_text}', nearest atom box index: {nearest_idx}")
	non_overlapping_texts.append({
	'text': ocr_text,
	'text_box': text_box,
	'nearest_atom_idx': nearest_idx,
	'distance': distance
	})

	#set up atom_ocr match atom_class
	atom_ocr=[]
	for i,ai in enumerate(atom_classes):
	if i in ai2text:
	atom_ocr.append(ai2text[i])
	# elif i in ai2rdkitlab_unknown:
	# atom_ocr.append(ai2rdkitlab_unknown[i])
	else:
	atom_ocr.append(idx_to_labels[ai])
	print(f"atom class + ocr presented as symbols::\n{atom_ocr}")
	atom_ocr=np.array(atom_ocr)
	# Perform deletions after the loop
	if indices_to_delete:
	indices_to_keep = np.setdiff1d(np.arange(len(atom_bbox)), list(indices_to_delete))
	atom_bbox = atom_bbox[indices_to_keep]
	atom_classes = atom_classes[indices_to_keep]
	atom_centers = atom_centers[indices_to_keep]
	atom_scores = atom_scores[indices_to_keep]
	atom_ocr= atom_ocr[indices_to_keep]

	# Adjust dictionary indices
	for d in [ai2text, ai2relplace, ai2rdkitlab_unknown]:
	d_new = {}
	for old_idx, value in d.items():
	new_idx = np.where(indices_to_keep == old_idx)[0][0] if old_idx in indices_to_keep else None
	if new_idx is not None:
	d_new[new_idx] = value
	d.clear()
	d.update(d_new)

	# Adjust nearest_atom_idx in non_overlapping_texts
	for entry in non_overlapping_texts:
	old_idx = entry['nearest_atom_idx']
	if old_idx in indices_to_keep:
	entry['nearest_atom_idx'] = np.where(indices_to_keep == old_idx)[0][0]
	else:
	entry['nearest_atom_idx'] = -1 # Mark as invalid if the nearest atom was deleted

	# Rebuild KDTree if needed for further use
	tree = cKDTree(atom_centers)

	# Final output
	print("Whole img with OCR :: ai2relplace, ai2rdkitlab_unknown:", [ai2relplace, ai2rdkitlab_unknown])
	print(f"Adjusted ai ocr_text: {ai2text}")
	print(f"Atom box num: {len(atom_bbox)}:: {[idx_to_labels[i] for i in atom_classes]}")
	print("Non-overlapping OCR text boxes:", non_overlapping_texts)

	#for all heavy atom labels, consider N3 pred as N, or other cases, I2M not good as paddle on ABC
	atomcorp_img = Image.open(image_path).convert('RGB')
	atomcorp_img1k=atomcorp_img.resize([1000,1000])
	text_contents_star=[]
	text_confidences_star=[]
	text_boxes_star=[]
	boxid2del=dict()
	ocr_discrepancies = {} # New dictionary to record OCR vs. AI mismatches
	print(f"has atom_bbox number {len(atom_bbox)}")
	for i,box in enumerate(atom_bbox):#split ocr image
	# if i in ai2text: continue #may be need comment this, if splited OCR acc better!!
	abox =box* [scale_x, scale_y, scale_x, scale_y]
	cropped_img=atomcorp_img1k.crop(abox)#if use the small ori image will not get infos
	image_npocr = np.array(cropped_img)
	result_ocr= ocr2.ocr(image_npocr, det=False)#,cls=True,use_debug=False, det=False)#det fale not box but get rcongized more
	# result_ocr= ocr.ocr(image_npocr, cls=True, det=False)#,cls=True, det=False)#det fale not box but get rcongized more
	if result_ocr:
	for line in result_ocr:
	# print(f"Atom box--- {i}, OCR result---: {line}")
	if line:
	box_flag=has_boxes(line)
	for box_info in line:
	# print(len(box_info))
	if not box_flag:
	text=box_info[0]
	#[^a-zA-Z0-9\\-\+] 表示匹配除了字母、数字、、- 和 + 之外的所有字符。
	text=re.sub(r'[^a-zA-Z0-9,\*\-\+]', '', text)#remove special chars
	score_=box_info[1]
	text_contents_star.append(text)
	text_confidences_star.append(score_)
	else:#when paddleOCRuse detection model get text box info
	box = box_info[0]
	x_coords = [point[0] for point in box]
	y_coords = [point[1] for point in box]
	text_box = [min(x_coords), min(y_coords), max(x_coords), max(y_coords)]
	text = box_info[1][0]
	text=re.sub(r'[^a-zA-Z0-9,\*\-\+]', '', text)#remove special chars
	text_boxes_star.append(text_box)
	text_contents_star.append(text)
	score_=box_info[1][1]
	text_confidences_star.append(score_)
	if i in ai2text:#ocr 全img vs split img
	# print(f'from whole img ocr atom box {i}----from whole img::{ai2text[i]}')
	if ai2text[i] != text:
	text=ai2text[i] if len(ai2text[i])>=len(text) else text
	print(f"Atom box {i}@@ OCR text: {text}, score: {score_}, AI class: {idx_to_labels[atom_classes[i]]}, AI score: {atom_scores[i]}")
	# Normalize OCR text
	if text in ['OH', 'HO']:
	text = 'O'
	elif text in ['SH', 'HS']:
	text = 'S'
	elif text in ['NH', 'HN']:
	text = 'N'
	elif text in ['CH', 'HC']:
	text = 'C'
	elif text == '0':
	text = 'O'
	elif text == 'L':
	text = 'Li'
	elif '-' in text:
	if text[:-1] in ABBREVIATIONS:
	text=text[:-1]

	# Check if OCR text is a single character and not a valid element
	is_single_char = len(text) == 1
	ai_pred = idx_to_labels[atom_classes[i]]
	#TOD add more simpfiled
	if text=='0':
	atom_classes[i]=lab2idx['O']
	elif text in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']:
	atom_classes[i]=lab2idx[text]#need update to keep H following Heavy
	# elif # ocr recongnized on lable C as other things chars
	elif is_single_char and text not in ELEMENTS and ai_pred == 'C':
	# Do not replace AI prediction, just record discrepancy
	ocr_discrepancies[i] = {
	'ocr_text': text,
	'ocr_score': score_,
	'ai_class': ai_pred,
	'ai_score': atom_scores[i]
	}
	else:
	overlap, b_nei=AtomBox2bondBox(atom_bbox[i],bond_bbox)
	if not overlap:
	if text not in ELEMENTS and text not in ABBREVIATIONS:
	# print(f"new cases::{text} for atombox {i} {atom_bbox[i]}check how to fix it !!!")
	# print(f'OCR text:: {text} score ::{box_info}\|\|atom clss::{idx_to_labels[atom_classes[i]]} {atom_scores[i]}')
	if text != idx_to_labels[atom_classes[i]]:
	boxid2del[i]= [text,idx_to_labels[atom_classes[i]]]#will delt this atom box infos
	else:
	if text != idx_to_labels[atom_classes[i]]:
	if atom_scores[i]<=score_:
	if text in RGROUP_SYMBOLS or text in ABBREVIATIONS:
	ai2relplace[i]=text
	atom_classes[i]=0
	if text in lab2idx and lab2idx[text] in list(range(23,29)):atom_classes[i]=lab2idx[text]
	elif text in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']:
	atom_classes[i]=lab2idx[text]
	else:
	ai2relplace[i]=text
	atom_classes[i]=0

	# 按照 value 的第一个元素（假设是字符串）的长度进行排序，长度大的排前
	boxid2del = dict(sorted(boxid2del.items(), key=lambda item: item[0], reverse=True))
	print(f"considering del box",boxid2del)
	print("after split img OCR:: ai2relplace,ai2rdkitlab_unknown",[ai2relplace,ai2rdkitlab_unknown])
	print(f"considering delet atomb box :{boxid2del}")
	syms=[]
	for i in range(len(atom_classes)):
	if i in ai2relplace: syms.append(ai2relplace[i])
	elif i in ai2rdkitlab_unknown:syms.append(ai2rdkitlab_unknown[i])
	else:
	syms.append(idx_to_labels[atom_classes[i]])
	print(f"atombox {atom_classes}:: number {len(atom_classes)}\n",[idx_to_labels[i] for i in atom_classes])
	print(f" {syms}")
	#chedck isolated box, if need add bond box between the isolated box or not
	isolated_ais = []
	# 第一步：构建 bond 到 atom 的映射，并计算 distance_threshold
	bond_distances = []
	singleAtomBond=dict()
	for bi, bb in enumerate(bond_bbox):
	overlapped_atoms = []
	overlapped_abox = []
	for ai, aa in enumerate(atom_bbox):
	overlap_flag = boxes_overlap(bb, aa)
	if overlap_flag:
	overlapped_atoms.append(ai)
	overlapped_abox.append(aa)
	# if bi not in b2a.keys():
	# b2a[bi] = [ai]
	# else:
	# b2a[bi].append(ai)
	if len(overlapped_atoms) == 2:
	center1 = calculate_center(atom_bbox[overlapped_atoms[0]])
	center2 = calculate_center(atom_bbox[overlapped_atoms[1]])
	distance = np.linalg.norm(center1 - center2)
	bond_distances.append(distance)
	# print(f"Bond {bi} connects atoms {overlapped_atoms}, distance: {distance:.2f}")
	elif len(overlapped_atoms) == 1:
	print(f"single bond - atom still exists for bond {bi}, need porcess this !!")
	if bi not in singleAtomBond:
	singleAtomBond[bi]=overlapped_atoms#considering use the add H box for solve TODO

	# 动态计算 distance_threshold
	distance_threshold = max(bond_distances) if bond_distances else 100.0 # 默认值 10 如果无 bond
	distance_threshold_min = min(bond_distances) if bond_distances else 100.0 # 默认值 10 如果无 bond
	print(f"Calculated distance_threshold center based: {distance_threshold:.2f}")

	# 第二步：构建 atom 到 bond 的映射，并检测孤立原子
	a2b=dict()
	for ai, aa in enumerate(atom_bbox):
	b_nei = []
	for bi, bb in enumerate(bond_bbox):
	overlap_flag = boxes_overlap(bb, aa)
	if overlap_flag:
	b_nei.append(bi)
	a2b[ai] = b_nei
	if a2b[ai] ==[]:
	if ai not in isolated_ais:
	isolated_ais.append(ai)

	isolated_ais=sorted(isolated_ais,reverse=True)#avoid delte atom with index errors
	print(f"isolated_ais atom box {isolated_ais}\n ", [idx_to_labels[i] for i in atom_classes[isolated_ais]])

	# 第三步：处理孤立原子，尝试合并或删除
	updated_atom_bbox = atom_bbox.copy()
	updated_atom_classes = atom_classes.copy()
	updated_atom_scores = atom_scores.copy()
	print(f"atom bbox num {len(atom_bbox)}")#ttt
	new_bond_bbox=[]
	deleted_ais=[]
	del4boxid2del=set()
	for isolated_ai in isolated_ais:
	isolated_box = atom_bbox[isolated_ai]
	isolated_center = calculate_center(isolated_box)
	nearest_distance = float('inf')
	nearest_ai = -1
	# 找到最近的非孤立原子
	for ai, aa in enumerate(atom_bbox):
	if ai not in isolated_ais and ai != isolated_ai:
	center = calculate_center(aa)
	distance = np.linalg.norm(isolated_center - center)
	if distance < nearest_distance:
	nearest_distance = distance
	nearest_ai = ai
	# 合并或删除逻辑
	if nearest_ai != -1:
	if nearest_distance<=distance_threshold_min or (nearest_distance <=distance_threshold and nearest_distance>=distance_threshold_min):#this the centers dist not bond length
	nearest_box = atom_bbox[nearest_ai]
	nearest_class = atom_classes[nearest_ai]
	nearest_center = calculate_center(nearest_box)
	if isolated_ai in boxid2del:
	textocr2del=boxid2del[isolated_ai][0]
	else:
	textocr2del=None
	#NOTE based ont the class and ovelap bond box to adjust
	overlap1,bondnei=AtomBox2bondBox(nearest_box,bond_bbox)
	if len(bondnei)==1:#could be add two other bond, add bond box
	# if textocr2del in [',', '+', '-'] or not any(c.isupper() for c in textocr2del):
	if textocr2del is not None and not any(c.isupper() for c in textocr2del):
	# del4boxid2del.add(isolated_ai)
	deleted_ais.append(isolated_ai)
	pass
	else:
	new_bc = (isolated_center + nearest_center)*0.5
	new_bondbox=np.array([new_bc[0] - nearest_distance*0.5,
	new_bc[1] - nearest_distance*0.5,
	new_bc[0] + nearest_distance*0.5,
	new_bc[1] + nearest_distance*0.5]
	)
	new_bond_bbox.append(new_bondbox.reshape(-1,4))
	print(f'add a new bond box new_bc for two atom boxes {isolated_ai} ---- {nearest_ai}::\n {idx_to_labels[atom_classes[isolated_ai]]} --- {idx_to_labels[atom_classes[nearest_ai]]}')
	else:#TODO fix me when get the case with >=2 bonds need add bond also
	try:
	new_box = merge_boxes(isolated_box, nearest_box)
	updated_atom_bbox[nearest_ai] = new_box
	chosed_score_ = max(atom_scores[isolated_ai], atom_scores[nearest_ai])
	updated_atom_scores[nearest_ai] = chosed_score_
	except Exception as e:
	print(f"file_name@: {image_path}\n SMILES in csv:\n{SMILESori}")
	print(e)
	print('nearest_ai ', nearest_ai)
	check2=True
	if check2:
	padding=5
	# box_thresh=0.3
	atombox_img=draw_objs(copy.deepcopy(img),
	atom_bbox* [scale_x, scale_y, scale_x, scale_y],
	atom_classes, atom_scores ,
	category_index=idx_to_labels,
	box_thresh=box_thresh,
	line_thickness=3,
	font='arial.ttf',
	font_size=10)
	bonbox_img=draw_objs(copy.deepcopy(img),
	bond_bbox* [scale_x, scale_y, scale_x, scale_y],
	bond_classes, bond_scores ,
	category_index=idx_to_labels,
	box_thresh=0.01,
	line_thickness=3,
	font='arial.ttf',
	font_size=10)
	# Get sizes of the individual images
	atom_width, atom_height = atombox_img.size
	bon_width, bon_height = bonbox_img.size
	combined_width = atom_width + bon_width + padding * 3
	combined_height = max(atom_height, bon_height) + padding * 2
	combined_img = Image.new('RGB', (combined_width, combined_height), color=(255, 255, 255)) # White background
	# Paste the images onto the new canvas
	combined_img.paste(atombox_img, (padding, padding)) # Top-left
	combined_img.paste(bonbox_img, (atom_width + padding * 2, padding))
	print(f"atom box afte NMS and merge_low_iou_boxes")
	combined_img.save(f"tttttttttttttttttttttttBoxed.png"
	)
	raise Exception("@debug this!!\n")

	if chosed_score_>=0.5:
	if chosed_score_==atom_scores[isolated_ai]:
	updated_atom_classes[nearest_ai] = 0 # mrege replaced with *
	# else:
	# updated_atom_classes[nearest_ai] = atom_classes[nearest_ai] # 保留较高 score 的类别
	updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0)#after mreged need del it
	# updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0)
	updated_atom_classes = np.delete(updated_atom_classes, isolated_ai)
	updated_atom_scores = np.delete(updated_atom_scores, isolated_ai)
	print(f"Merged atom {isolated_ai} into {nearest_ai}, new box: {new_box}")
	isolated_ais.remove(isolated_ai)
	deleted_ais.append(isolated_ai)
	# elif nearest_distance<=distance_threshold_min:#very close,mrege with nearest one
	elif atom_scores[isolated_ai] < 0.5:
	# 删除低分孤立原子
	updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0)
	updated_atom_classes = np.delete(updated_atom_classes, isolated_ai)
	updated_atom_scores = np.delete(updated_atom_scores, isolated_ai)
	print(f"DELET isolated atom {isolated_ai} with score {atom_scores[isolated_ai]}")
	deleted_ais.append(isolated_ai)
	# 更新索引，因为数组维度变化
	isolated_ais = [i if i < isolated_ai else i - 1 for i in isolated_ais if i != isolated_ai]
	else:
	print(f"KEEP isolated atom {isolated_ai} with score {atom_scores[isolated_ai]} >= 0.5")


	else:
	if atom_scores[isolated_ai] < 0.5:
	updated_atom_bbox = np.delete(updated_atom_bbox, isolated_ai, axis=0)
	updated_atom_classes = np.delete(updated_atom_classes, isolated_ai)
	updated_atom_scores = np.delete(updated_atom_scores, isolated_ai)
	print(f"DELET isolated atom {isolated_ai} with score {atom_scores[isolated_ai]}")
	deleted_ais.append(isolated_ai)
	isolated_ais = [i if i < isolated_ai else i - 1 for i in isolated_ais if i != isolated_ai]
	else:
	print(f"KEEP isolated atom {isolated_ai} with score {atom_scores[isolated_ai]} >= 0.5")

	if len(new_bond_bbox)>0:
	for i,bond_box in enumerate(new_bond_bbox):
	bond_bbox= np.concatenate([bond_bbox,bond_box],axis=0)
	bond_scores= np.concatenate((bond_scores,np.array([0.9])),axis=0)
	bond_classes= np.concatenate([bond_classes,np.array([13])],axis=0)
	#reset bond center
	x_center = (bond_bbox[:, 0] + bond_bbox[:, 2]) / 2
	y_center = (bond_bbox[:, 1] + bond_bbox[:, 3]) / 2
	# center_coords = torch.stack((x_center, y_center), dim=1)
	center_coords = np.stack((x_center, y_center), axis=1)
	bond_centers=center_coords

	#del the additional atom box that not connected by bond box also mismatch other rules
	if len(deleted_ais) > 0: # 如果有需要删除的索引
	print(f"will delete atom box with idx :: {deleted_ais}")
	# 使用 np.delete 一次性删除所有指定的行
	atom_classes = np.delete(atom_classes, deleted_ais, axis=0)
	atom_scores = np.delete(atom_scores, deleted_ais, axis=0)
	atom_bbox = np.delete(atom_bbox, deleted_ais, axis=0)
	atom_ocr = np.delete(atom_ocr, deleted_ais, axis=0)

	# eles=[idx_to_labels[i] for i in atom_classes]
	# print(eles,len(eles))
	cur_atomSymbols=[idx_to_labels[i] for i in atom_classes]
	ocr_wholeImg=[]
	for i in atom_classes:
	if i in ai2relplace:
	ocr_wholeImg.append(ai2relplace[i])
	elif i in ai2rdkitlab_unknown:
	ocr_wholeImg.append(ai2rdkitlab_unknown[i])
	else:
	ocr_wholeImg.append(idx_to_labels[i])
	print("ai2relplace,ai2rdkitlab_unknown",ai2relplace,ai2rdkitlab_unknown)
	print("cur_atomSymbols:",cur_atomSymbols)
	print(" atomSymbolsOCR:",ocr_wholeImg)

	# 找到 'H' 的索引, H after Heavy
	h_indices = np.where(atom_classes == lab2idx['H'])[0]
	non_h_indices = np.where(atom_classes != lab2idx['H'])[0]
	# print(h_indices,non_h_indices)
	# 重新排序
	new_order = np.concatenate((non_h_indices, h_indices)).astype(np.int64)
	# newid2old_Hafter={ i:j for i,j in enumerate(new_order)}
	# old2newid_Hafter={ j:i for i,j in enumerate(new_order)}
	atom_classes = atom_classes[new_order]
	atom_bbox = atom_bbox[new_order]
	atom_scores = atom_scores[new_order]
	x_center = (atom_bbox[:, 0] + atom_bbox[:, 2]) / 2
	y_center = (atom_bbox[:, 1] + atom_bbox[:, 3]) / 2
	# center_coords = torch.stack((x_center, y_center), dim=1)
	center_coords = np.stack((x_center, y_center), axis=1)
	atom_centers=center_coords#TODO 记得把 abbve idx label same reoder or mapping then bond
	#bond box reoder like atom box, let the singleAtomBond later
	bond_bbox = reorder_bond_bbox(bond_bbox, singleAtomBond)
	bond_classes = reorder_bond_bbox(bond_classes, singleAtomBond)
	bond_scores = reorder_bond_bbox(bond_scores, singleAtomBond)
	bond_centers = reorder_bond_bbox(bond_centers, singleAtomBond)

	# 第二步：构建 atom 到 bond 的映射，并检测孤立原子
	a2b=dict()
	for ai, aa in enumerate(atom_bbox):
	b_nei = []
	for bi, bb in enumerate(bond_bbox):
	overlap_flag = boxes_overlap(bb, aa)
	if overlap_flag:
	b_nei.append(bi)
	a2b[ai] = b_nei
	if a2b[ai] ==[]:
	if ai not in isolated_ais:
	isolated_ais.append(ai)

	b2a=dict()
	for bi,bb in enumerate(bond_bbox):
	overlapped_atoms = []
	overlapped_abox=[]
	for ai,aa in enumerate(atom_bbox):
	overlap_flag=boxes_overlap(bb, aa)#TODO use tghe atom bond box overlap get bond atom mapping,then built mol
	if overlap_flag:
	# print(bb, aa,overlap_flag)
	overlapped_atoms.append(ai)
	overlapped_abox.append(aa)
	if bi not in b2a.keys():
	b2a[bi]=[ai]
	else:
	# vais=b2a[bi]
	b2a[bi].append(ai)
	if len(overlapped_atoms) == 1:
	print(f"single bond -atom still exists {overlapped_atoms}")

	#c2a a2c
	#charge atom idx maping
	if len(charges_classes) > 0:
	# print(charges_bbox,charges_classes,len(charges_classes))
	kdt = cKDTree(atom_centers)
	atid_list=list(range(len(atom_centers)))
	used_charge_indices=set()
	c2a=dict()
	for i, (x,y) in enumerate(charges_centers):
	overlapped_abox=[]
	cc=charges_bbox[i]
	for ai, aa in enumerate(atom_bbox):
	overlap_flag=boxes_overlap(cc, aa)
	ac_iou=calculate_iou(cc, aa)
	charge_=charges_classes[i]
	charge_score=charges_scores[i]
	if overlap_flag:
	if i in c2a:
	c2a[i].append(ai)
	else:
	c2a[i]=[ai]
	if ai not in atid_list:
	print(f"Warning: ai {ai} is out of range for atom_list.")
	continue # 跳过当前循环迭代
	# idx_to_labels[charges_classes[0]]
	a2c=dict()
	for ci,v in c2a.items():
	charge_=idx_to_labels[charges_classes[ci]]
	if len(v)==1:
	a2c[v[0]]=ci
	else:
	for ai in v:
	ats=idx_to_labels[atom_classes[ai]]
	if ats=='other':
	ats='*'
	if ats in ['F','Cl','I','Br','O'] and int(charge_)<0:
	a2c[ai]=ci
	elif ats in ['N','H','P'] and int(charge_)>0:
	a2c[ai]=ci
	else:
	print(f'unusuaal case charge {charge_} with atom {ats}!!')

	print(f"all a2b b2a a2c c2a done, start mol built")
	#finsh the update of box back to the output for retraining used
	output={
	'bbox': np.concatenate([atom_bbox, bond_bbox,charges_bbox], axis=0),
	'bbox_centers': np.concatenate([atom_centers, bond_centers,charges_centers],axis=0),
	'scores': np.concatenate([atom_scores, bond_scores, charges_scores],axis=0),
	'pred_classes': np.concatenate([atom_classes, bond_classes, charges_classes],axis=0),
	'image_path': image_path
	}
	# boxinfo
	boxinfor={
	'bbox': output['bbox'],
	'scores': output['scores'],#TODO use same vocabl ?
	'pred_classes': output['pred_classes'],#[ lab2idx[x] for x in output['pred_classes']],#changet it back to character
	'image_path': image_path
	}
	#split agin for buit mol
	charge_mask = np.array([True if ins in charge_labels else False for ins in output['pred_classes']])
	charges_bbox=output['bbox'][charge_mask]
	charges_centers=bbox2center(charges_bbox)
	# charges_centers= output['bbox_centers'][charge_mask]
	charges_classes= output['pred_classes'][charge_mask]
	charges_scores= output['scores'][charge_mask]
	charges_bbox, charges_centers, charges_scores,charges_classes,figc =view_box_center2(charges_bbox, charges_centers, charges_scores,charges_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)
	#view_box_center2 help remove large box if boxscore small than 0.5
	# bonds_mask2 = np.array([True if ins in bond_labels else False for ins in output['pred_classes']])
	# bonds_mask= output['scores'][bonds_mask2]>=0.1# TODO fix me, as training bond box overlap with bondbox,aussme bond socre make sense
	bonds_mask = np.array([True if ins in bond_labels and output['scores'][i]>0.2 else False for i, ins in enumerate(output['pred_classes'])])
	bond_bbox=output['bbox'][bonds_mask]
	bond_centers=bbox2center(bond_bbox)
	# bond_centers= output['bbox_centers'][bonds_mask]
	bond_classes= output['pred_classes'][bonds_mask]
	bond_scores= output['scores'][bonds_mask]
	print(f"before view_box_center2 bond nums {len(bond_scores)}")
	# bond_bbox2, bond_centers2, bond_scores2,bond_classes2,fig=view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)
	bond_bbox, bond_centers, bond_scores,bond_classes,fig =view_box_center2(bond_bbox, bond_centers, bond_scores,bond_classes, overlap_dist_thresh=5.0, max_centers_per_box=3)
	print(f"after view_box_center2 bond nums {len(bond_scores)}")

	heavy_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins != lab2idx['H'] else False for ins in output['pred_classes']])
	h_mask= np.array([True if ins not in bond_labels and ins not in charge_labels and ins == lab2idx['H'] else False for ins in output['pred_classes']])

	#TODO fix me if heavy or H all need this view_box_center2 filtering
	heavy_bbox = output['bbox'][heavy_mask]
	# heavy_classes = output['pred_classes'][heavy_mask]
	heavy_centers=bbox2center(heavy_bbox)
	# heavy_centers= output['bbox_centers'][heavy_mask]
	heavy_scores= output['scores'][heavy_mask]
	heavy_classes = output['pred_classes'][heavy_mask]
	heavy_bbox, heavy_centers, heavy_scores,heavy_classes,fighv =view_box_center2(heavy_bbox, heavy_centers, heavy_scores,heavy_classes, overlap_dist_thresh=5.0, max_centers_per_box=5)
	###########################start build mol ##########################
	rwmol_ = Chem.RWMol()
	boxi2ai = {} # 预测索引 -> RDKit 索引
	placeholder_atoms=dict()
	J=0
	for i, (bbox, a) in enumerate(zip(atom_bboxes, atom_classes)):
	a2labl=False
	a=replace_cg_notation(a)
	# print(a,'atom box class label')
	if a in ['H', 'C', 'O', 'N', 'Cl', 'Br', 'S', 'F', 'B', 'I', 'P', 'Si']:# '*', I2M's defined atom types
	# if a=='H':continue#skip H fristly,only with heavy atom then addH
	ad = Chem.Atom(a)#TODO consider non chemical group and label for using
	#TODO add pd rdkit known elemetns here
	elif a in ELEMENTS:
	ad = Chem.Atom(a)
	elif a in ABBREVIATIONS :
	ad = Chem.Atom("*")
	placeholder_atoms[i] = a # 记录非标准原但有定义的官能团类型及其位置,
	a2labl=True

	else:
	if N_C_H_expand(a):
	ad = Chem.Atom("*")
	placeholder_atoms[i] = a # 记录非标准原但有定义的官能团类型及其位置,
	a2labl=True
	elif C_H_expand(a):
	ad = Chem.Atom("*")
	placeholder_atoms[i] = a # 记录非标准原但有定义的官能团类型及其位置,
	a2labl=True
	elif C_H_expand2(a):
	ad = Chem.Atom("*")
	placeholder_atoms[i] = a # 记录非标准原但有定义的官能团类型及其位置,
	a2labl=True
	elif formula_regex(a):
	ad = Chem.Atom("*")
	placeholder_atoms[i] = a # 记录非标准原但有定义的官能团类型及其位置,
	a2labl=True
	else:
	ad = Chem.Atom("*")
	if a not in ['*',"other"]:
	a2labl=True
	# placeholder_atoms[idx] = a
	# atom = Chem.Atom(symbol)
	rwmol_.AddAtom(ad)
	boxi2ai[J] = rwmol_.GetNumAtoms() - 1
	if a2labl: rwmol_.GetAtomWithIdx(J).SetProp("atomLabel", f"{a}")#mol set with label, mol_rebuild not
	J+=1

	# 使用 KDTree 构建重原子间的键（如果提供了 bond_bbox）
	if len(charges_classes) > 0:
	for k,v in a2c.items():
	fc=int(idx_to_labels[charges_classes[v]])
	rwmol_.GetAtomWithIdx(k).SetFormalCharge(fc)
	# print(f"mol with heavy atoms number {i+1}, max heavy atom id {i}")
	print(f"mol with atoms number {i+1}, max atom id {i}")
	print(f"mol with bond box number {len(bond_classes)}")
	print(f"placeholder_atoms@@ {placeholder_atoms}")

	#重原子 skeleton mol
	bonds=dict()
	existing_bonds = set()
	b2aa=dict()
	singleAtomBond=[]
	bondWithdirct=[]

	# tree_heavy = KDTree(heavy_centers)#TODO before add bond consdiering reodering bond ??
	tree_atom = KDTree(atom_centers)#TODO as atom bond are all reodered to kee H last
	if len(idx_to_labels)==30:
	_margin=0#ad this version bond dynamicaly changed
	for bi, (bbox, idx_) in enumerate(zip(bond_bbox, bond_classes)):#not work for cross-bond, longer bond, as the center of bond may be close to as atoms not it two atoms
	bond_type = idx_to_labels[idx_]
	if len(idx_to_labels)==23:
	if idx_to_labels[bond_type] in ['-','SINGLE', 'NONE', 'ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']:
	_margin = 5
	else:
	_margin = 8
	anchor_positions = (bbox + [_margin, _margin, -_margin, -_margin]).reshape([2, -1])
	oposite_anchor_positions = anchor_positions.copy()
	oposite_anchor_positions[:, 1] = oposite_anchor_positions[:, 1][::-1]
	# Upper left, lower right, lower left, upper right
	# x1y1, x2y2, x1y2, x2y1 : dinuogl lines
	anchor_positions = np.concatenate([anchor_positions, oposite_anchor_positions])
	# print(f"anchor_positions {anchor_positions.shape}\n{anchor_positions}")
	dists, neighbours = tree_atom.query(anchor_positions, k=1)
	if np.argmin((dists[0] + dists[1], dists[2] + dists[3])) == 0:
	# visualize setup
	begin_idx, end_idx = neighbours[:2]
	else:
	# visualize setup
	begin_idx, end_idx = neighbours[2:]
	atom1_idx = boxi2ai[begin_idx]
	atom2_idx = boxi2ai[end_idx]
	if atom1_idx == atom2_idx:#NOTE when bond with only one terminal atom, other side H not used
	print(f"attempt to add self-bond:{bi} atomIdx1 == atomIdx2 ::{[atom1_idx, atom2_idx]}")
	print(f"for bond bi {bi} H atom may involbed dists:",dists)
	print(neighbours)
	print("anchor_positions",anchor_positions)
	else:
	if bond_type in ['-', 'NONE', 'ENDUPRIGHT', 'BEGINWEDGE', 'BEGINDASH', 'ENDDOWNRIGHT']:
	if bond_type in BONDDIRECT:
	bonds[bi] = (atom1_idx, atom2_idx, 'SINGLE', bond_type)
	bondWithdirct.append(bi)
	else:
	bonds[bi] = (atom1_idx, atom2_idx, 'SINGLE', None)
	bond_type=BONDTYPE['SINGLE']
	elif bond_type == '=':
	bonds[bi] = (atom1_idx, atom2_idx, 'DOUBLE', None)
	bond_type=BONDTYPE['DOUBLE']
	elif bond_type == '#':
	bonds[bi] = (atom1_idx, atom2_idx, 'TRIPLE', None)
	bond_type=BONDTYPE['TRIPLE']
	else:
	print(f'unkown bond type relaced with single@@ {bond_type}')
	bonds[bi] = (atom1_idx, atom2_idx, 'SINGLE', None)
	bond_type=BONDTYPE['SINGLE']
	# 检查价态
	atom1 = rwmol_.GetAtomWithIdx(atom1_idx)
	atom2 = rwmol_.GetAtomWithIdx(atom2_idx)
	val1 = sum(b.GetBondTypeAsDouble() for b in atom1.GetBonds())
	val2 = sum(b.GetBondTypeAsDouble() for b in atom2.GetBonds())
	max_val1 = max(VALENCES[atom1.GetSymbol()])
	max_val2 = max(VALENCES[atom2.GetSymbol()])
	# bond_order = bond_type.AsDouble()
	bond_order=BONDTYPE2ORD[bond_type]
	if val1 + bond_order <= max_val1 and val2 + bond_order <= max_val2:
	bond1 = rwmol_.GetBondBetweenAtoms(atom1_idx, atom2_idx)
	bond2 = rwmol_.GetBondBetweenAtoms(atom2_idx, atom1_idx)
	if bond1 or bond2:
	# print(f'bond exists for {[atom1_idx, atom2_idx]}')
	pass
	# if (atom1_idx, atom2_idx) not in existing_bonds and (atom2_idx, atom1_idx) not in existing_bonds:
	else:
	# print(atom1_idx, atom2_idx, bond_type,[ bi, idx_to_labels[idx_] ])
	rwmol_.AddBond(atom1_idx, atom2_idx, bond_type)
	else:
	print(f"Skipping bond {bi}: Exceeds valence.")
	existing_bonds.add((atom1_idx, atom2_idx))
	b2aa[bi]=sorted([atom1_idx, atom2_idx])

	if len(bond_bbox)==1 and len(atom_bbox)==2:
	ca1='[:0][C:2]#[C:3][:1]'#acs phC#CpH
	rwmol_ = Chem.RWMol()
	ats= ['','','C','C']
	for ia in ats:
	a=Chem.Atom(ia)
	id_=rwmol_.AddAtom(a)
	# print(ia,id_)
	rwmol_.AddBond(2, 3, Chem.BondType.TRIPLE)
	rwmol_.AddBond(0, 2, Chem.BondType.SINGLE)
	rwmol_.AddBond(1, 3, Chem.BondType.SINGLE)

	# Chem.MolFromSmiles(ca1)
	for i in range(len(atom_classes)):
	atom_classes[i]=lab2idx['*']
	AllChem.Compute2DCoords(rwmol_)
	else:
	rwmol_=copy.deepcopy(rwmol_)
	print(f"placeholder_atoms {placeholder_atoms}")

	#assign 2D coords
	mol = rwmol_.GetMol()
	mol.RemoveAllConformers()
	conf = Chem.Conformer(mol.GetNumAtoms())
	# conf.Set3D(True)
	# for i, (x, y) in enumerate(heavy_centers):
	for i, (x, y) in enumerate(atom_centers):
	x, y=float(x),float(y)
	conf.SetAtomPosition(i, (x, y, 0))#TODO why some time need -y, just display same as ori?
	mol.AddConformer(conf)
	# Chem.SanitizeMol(mol)
	Chem.AssignStereochemistryFrom3D(mol)
	rwmol_=Chem.RWMol(mol)
	#as afte H a\lso didthis
	skeleton_mol=copy.deepcopy(rwmol_)
	print(skeleton_mol.GetNumBonds())
	chiral_centers_aids = Chem.FindMolChiralCenters(mol, includeUnassigned=True)

	# H realted post-process
	heavyNumber=len(heavy_centers)
	print(f'mol with heavy number atoms {heavyNumber}, max id {heavyNumber-1}')
	onlyHeayMol=copy.deepcopy(rwmol_)
	chiral_centers = Chem.FindMolChiralCenters(
	rwmol_, includeUnassigned=True, includeCIP=False, useLegacyImplementation=False)
	chiral_center_ids = [idx for idx, _ in chiral_centers]
	Hais=[]
	Hais_bt=[]
	Hbd=[]
	# H_existing_bonds = set()
	for bi, ais in b2a.items():#from box overlap
	bt=bond_classes[bi]# in [14,15]#directon bond
	for ai in ais:
	if ai>heavyNumber-1:
	if bt in [14,15]:#directon bond
	Hais.append(ais)#NOTE ais ai increasing order as two for loop increasing
	print(f"within H bond box id {bi} bond direction {idx_to_labels[bt]} atoms box id {ais} ")
	Hais_bt.append(idx_to_labels[bt])
	Hbd.append(bi)
	# print(bonds[bi] )
	# add Hbonds with direction
	H_existing_bonds = set()
	ha2boxa=dict()
	for ais, bt in zip(Hais,Hais_bt):
	idx_2=ais[-1]
	idx_1=ais[0]
	hbond=rwmol_.GetBondBetweenAtoms(idx_1,idx_2)
	if hbond is not None:
	if idx_1 in chiral_center_ids:#if not in the chiral atom, will not set bond directions
	hbond.SetBondDir(BOND_DIRS[bt])
	else:
	had = Chem.Atom("H")
	addHatom_idx = rwmol_.AddAtom(had)
	ha2boxa[addHatom_idx]=idx_2
	# print(idx_2,addHatom_idx)#Note if detected H box will lead idx_2 != addHatom_idx
	atom= rwmol_.GetAtomWithIdx(idx_1)
	max_val=max(VALENCES[atom.GetSymbol()])
	val = sum(b.GetBondTypeAsDouble() for b in atom.GetBonds())
	if (idx_1, addHatom_idx) not in H_existing_bonds and (addHatom_idx, idx_1) not in H_existing_bonds:
	if val<=max_val-1:
	# print(f"atom id {idx_1} val {val} max_val {max_val}")
	print(idx_1, addHatom_idx)#let check bond exist or not!!
	rwmol_.AddBond(idx_1,addHatom_idx, Chem.BondType.SINGLE)#BOND_DIRS[bt]
	b=rwmol_.GetBondBetweenAtoms(idx_1,addHatom_idx)
	if idx_1 in chiral_center_ids:#if not in the chiral atom, will not set bond directions
	b.SetBondDir(BOND_DIRS[bt])#############Note can be done in the following tree
	H_existing_bonds.add((idx_1,addHatom_idx))
	i
	if len(ha2boxa)>0:#consider Hnow
	#use box coords assign 2D, remove extra Hs also update box
	rwmol_.RemoveAllConformers()#
	conf = Chem.Conformer(rwmol_.GetNumAtoms())
	conf.Set3D(True)
	coords2d=[]
	for i, (x, y) in enumerate(heavy_centers):
	position = Point3D(float(x), float(y), 0.) # Create a Point3D object with x, y, and z=0
	conf.SetAtomPosition(i, position)
	coords2d.append([x,y])
	for k,v in ha2boxa.items():
	x,y=atom_centers[v]
	position = Point3D(float(x), float(y), 0.) # Create a Point3D object with x, y, and z=0
	conf.SetAtomPosition(k, position)
	coords2d.append([x,y])
	rwmol_.AddConformer(conf)

	additonalH=detect_unconnected_hydrogens(rwmol_)
	if len(additonalH)>0:
	rwmol_,rmovedAtomcoords=remove_unconnected_hydrogens2(rwmol_) #NOTE 留给将来WEB开发用will dercease h atom,but the box have not updated TODO fix me this in feature activate learning
	#update atom box infors
	if len(rmovedAtomcoords)>0:#update box infors
	delbb=[]
	kdt = cKDTree(atom_centers)
	for i, (x,y,z) in enumerate(rmovedAtomcoords):#z=0
	dist, idx_=kdt.query([x,y], k=1)
	delbb.append(idx_)
	mask = np.ones(len(atom_classes), dtype=bool) # 初始化为 True
	mask[delbb] = False
	atom_bbox = atom_bbox[mask]
	atom_classes = atom_classes[mask]
	atom_centers = atom_centers[mask]
	# mol# mol_rebuit=copy.deepcopy(mol)

	mol=copy.deepcopy(rwmol_)
	conf=mol.GetConformers()[0]
	mola2xy=dict()
	mola2d=[]
	for i,a in enumerate(mol.GetAtoms()):
	x,y,z=conf.GetAtomPosition(i)
	mola2xy[i]=[x,y]
	mola2d.append([x,y])
	# print( x,y,z)
	kdt = cKDTree(mola2d)
	chiral_centers = Chem.FindMolChiralCenters(
	mol, includeUnassigned=True, includeCIP=False, useLegacyImplementation=False)
	chiral_center_ids = [idx for idx, _ in chiral_centers]

	for bi,bcent in enumerate(bond_centers):
	if bi in bondWithdirct :#and bi not in Hbd:#Note as set Hbd previously
	dists, a1a2 = kdt.query(bcent, k=2)
	a1,a2=sorted(a1a2)
	a1,a2=int(a1),int(a2)
	bt= mol.GetBondBetweenAtoms(a1, a2)#RDKit 的键是无向的，返回的是同一个 Bond 对象
	if bt:
	# 获取键的当前起点和终点
	current_begin = bt.GetBeginAtomIdx()
	current_end = bt.GetEndAtomIdx()
	bond_dir=bond_dirs[idx_to_labels[bond_classes[bi]]]
	if bond_dir == rdchem.BondDir.BEGINWEDGE:
	reverse_dir = rdchem.BondDir.BEGINDASH
	elif bond_dir == rdchem.BondDir.BEGINDASH:
	reverse_dir = rdchem.BondDir.BEGINWEDGE
	# else:
	# reverse_dir= rdchem.BondDir.BEGINWEDGE
	if a1 in chiral_center_ids:
	if current_begin == a1:
	bt.SetBondDir(bond_dir)
	print(f'a1 dir')
	else:
	# 如果手性原子是终点，反转方向（例如用相反的楔形键）
	bt.SetBondDir(reverse_dir)
	print(f'a1 reverse_dir')
	# print(f'set bond direction a1a2 {[bi, a1,a2]}')
	# bt.SetBondDir(bond_dirs[idx_to_labels[bond_classes[bi]]])
	elif a2 in chiral_center_ids:
	if current_begin == a2:
	bt.SetBondDir(bond_dir)
	print(f'a2 dir {bond_dir} {reverse_dir}')
	else:
	# 如果手性原子是终点，反转方向（例如用相反的楔形键）,but not work, just remove and add
	mol.RemoveBond(current_begin, current_end)
	mol.AddBond(current_end, current_begin, bt.GetBondType())
	bond = mol.GetBondBetweenAtoms(current_end, current_begin)
	bond.SetBondDir(bond_dir)
	print(f'a2 reverse_dir {bond_dir} {reverse_dir}')
	# bt= mol.GetBondBetweenAtoms(a2, a1)
	# print(f'set bond direction a2a1 {[bi, a2,a1]}')
	# bt.SetBondDir(bond_dirs[idx_to_labels[bond_classes[bi]]])
	else:
	print('bond stro not with chiral atom???, will ignore this stero bond infors')
	print(f"{[bi, bond_dir, current_begin,current_end]}")
	# beginatom=mol.GetAtomWithIdx(current_begin)
	# Endatom=mol.GetAtomWithIdx(current_end)
	# beginatom_neis=len(beginatom.GetBonds())
	# Endatom_neis=len(Endatom.GetBonds())
	try:
	mol_rebuit=mol.GetMol()
	conf = mol_rebuit.GetConformer()
	Chem.WedgeMolBonds(mol_rebuit,conf)#
	Chem.DetectBondStereochemistry(mol_rebuit)
	Chem.AssignChiralTypesFromBondDirs(mol_rebuit)
	Chem.AssignStereochemistry(mol_rebuit)
	#
	smiH=Chem.MolToSmiles(mol_rebuit)
	print(F"smiH\n",smiH)
	# canon_smilesH = Chem.CanonSmiles(smiH)
	# print(F"canon_smilesH\n",canon_smilesH)
	# rdkit_coni_smiH=Chem.MolToSmiles(Chem.MolFromSmiles(smiH))
	# print(f"Chem.MolToSmiles(Chem.MolFromSmiles(smiH))\n {rdkit_coni_smiH}")
	#
	mol = rdkit.Chem.RWMol(mol_rebuit)
	other2ppsocr=True
	if other2ppsocr:
	print()
	need_cut=[]
	ppstr=[]
	ppstr_score=[]
	crops=[]
	index_token=dict()
	expan=0#NOTE this control how much the part of bond in crop_Img
	for i_,(heav_c,heav_box) in enumerate(zip(atom_classes,atom_bbox)):
	if lab2idx['*']==heav_c or lab2idx['other']==heav_c or lab2idx['Cl']==heav_c:
	need_cut.append(i_)
	a=heav_box+np.array([-expan,-expan,expan,expan])
	# print(heav_box.shape,a.shape)
	box=a * [scale_x, scale_y, scale_x, scale_y]#TODO need the fix as w h may not equal!!
	# print(a,box,[scale_x, scale_y, scale_x, scale_y])
	cropped_img = img_ori_1k.crop(box)
	crops.append(cropped_img)
	image_npocr = np.array(cropped_img)
	result_ocr= ocr2.ocr(image_npocr, det=False)
	s_, score_ =result_ocr[0][0]
	s_previos=atom_ocr[i_]
	if s_previos != "other" :
	s_=s_previos if len(s_previos)>=len(s_) else s_
	print(f'ocr::idx:{i_}',s_, score_ )
	if score_<=0.1:# process cropped_img and try again
	# print(s_, "xxx",score_)
	s_='*'
	if s_=='+' or s_=='-':
	s_="*"
	if len(s_)>1:
	s_=re.sub(r'[^a-zA-Z0-9,\*\-\+]', '', s_)#remove special chars
	if re.match(r'^\d+$', s_):
	s_=f'{s_}'#number+
	# print(f'why only numbers ? {s_}')
	if s_=='L':s_='Li'
	elif s_=='0':s_='O'
	elif s_ in ['N,+ CI','N,+ Cl' ,'N,+Cl','N,+CI','N+CI']:s_='N2+Cl-'
	elif s_ in ['NO,','O,N' ]:s_='NO2'


	match = re.match(r'^(\d+)?(.*)', s_)
	# print(s_,'xxxx')
	if match:
	numeric_part, remaining_part = match.groups()
	fc_=mol.GetAtomWithIdx(i_).GetFormalCharge()
	if remaining_part in ELEMENTS:
	new_atom = Chem.Atom(remaining_part)
	mol.ReplaceAtom(i_, new_atom)
	print(i_, remaining_part,"@@@")
	elif remaining_part in ABBREVIATIONS:# can be expanded with placeholder_atoms
	placeholder_atoms[i_]=s_# such 2Na will be get for rdkit
	elif remaining_part=='OH':
	new_atom = Chem.Atom("O")
	mol.ReplaceAtom(i_, new_atom)
	elif remaining_part=='SH':
	new_atom = Chem.Atom("S")
	mol.ReplaceAtom(i_, new_atom)
	elif remaining_part=='NH':
	new_atom = Chem.Atom("N")
	mol.ReplaceAtom(i_, new_atom)
	mol.GetAtomWithIdx(i_).SetFormalCharge(fc_)
	index_token[i_]=f'{s_}:{i_}'
	print(f"idx:{i_}, atm: <{idx_to_labels[heav_c]}> --- [{s_}:{i_}] with score:{score_} \|\|previousOCR:: {atom_ocr[i_]}")
	if s_ in ELEMENTS :
	new_atom = Chem.Atom(s_)
	mol.ReplaceAtom(i_, new_atom)
	mol.GetAtomWithIdx(i_).SetProp("atomLabel", f"{s_}")#mol set with label, mol_rebuit not
	ppstr.append(s_)
	ppstr_score.append(score_)
	if s_ in ABBREVIATIONS.keys():
	placeholder_atoms[i_]=s_
	#
	bond_dirs_rev={v:k for k,v in bond_dirs.items()}
	wdbs=[]
	for b in mol.GetBonds():
	bd=b.GetBondDir()
	bt=b.GetBondType()
	# print(bd)
	if bd ==bond_dirs['BEGINDASH'] or bd==bond_dirs['BEGINWEDGE']:
	a1,a2=b.GetBeginAtomIdx(), b.GetEndAtomIdx()
	wdbs.append([a1,a2,bt,bond_dirs_rev[bd]])

	#expand mol if exists
	# if len(placeholder_atoms)>0:###
	cm=copy.deepcopy(mol)
	# print(placeholder_atoms)
	expand_mol, expand_smiles= expandABB(cm,ABBREVIATIONS, placeholder_atoms)
	SMILESpre=expand_smiles
	rdm=copy.deepcopy(expand_mol)
	target_mol, ref_mol=rdm, cm
	AllChem.Compute2DCoords(target_mol)
	pair=[target_mol, ref_mol]
	mcs=rdFMCS.FindMCS([target_mol, ref_mol], # larger,small order
	# atomCompare=rdFMCS.AtomCompare.CompareAny,
	bondCompare=rdFMCS.BondCompare.CompareAny,
	ringCompare=rdFMCS.RingCompare.IgnoreRingFusion,
	matchChiralTag=False,
	)
	mcs_mol = Chem.MolFromSmarts(mcs.smartsString)
	AllChem.Compute2DCoords(mcs_mol)

	matches0 = pair[0].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False)
	matches1 = pair[1].GetSubstructMatches(mcs_mol, useQueryQueryMatches=True,uniquify=False, maxMatches=1000, useChirality=False)
	if len(matches0) != len(matches1):
	matches0=list(matches0)
	matches1=list(matches1)
	# print( "noted: matcher not equal !!")
	if len(matches0)>len(matches1):
	for i in range(0,len(matches0)):
	if i < len(matches1):
	pass
	else:
	ii=i % len(matches1)
	matches1.append(matches1[ii])
	else:
	for i in range(0,len(matches1)):
	if i < len(matches0):
	pass
	else:
	ii=i % len(matches0)
	matches0.append(matches0[ii])
	assert len(matches0) == len(matches1), "matcher not equal break!!"
	atommaping_pairs=[list(zip(matches0[i],matches1[i])) for i in range(0,len(matches0))]
	atomMap=atommaping_pairs[0]
	rmsd2=rdkit.Chem.rdMolAlign.AlignMol(prbMol=target_mol, refMol=ref_mol, atomMap=atomMap,maxIters=2000000)
	print(f"rmsd {rmsd2}")
	#ocr_mol
	ocr_mol = copy.deepcopy(target_mol)
	AllChem.Compute2DCoords(ocr_mol)
	ocr_smi = Chem.MolToSmiles(ocr_mol)
	molexp=ocr_mol
	expandStero_smi, success= rdkit_canonicalize_smiles(ocr_smi)
	# expandStero_smi = Chem.CanonSmiles(ocr_smi)#, useChiral=(not ignore_chiral))

	# TODO #[3H] 2H prpared box for training are too smalled, need adjust
	if visual_check:
	boxed_img = draw_objs(img,
	atom_bbox,
	atom_classes,
	atom_scores,
	category_index=idx_to_labels,
	box_thresh=0.5,
	line_thickness=3,
	font='arial.ttf',
	font_size=10)
	opts = Draw.MolDrawOptions()
	opts.addAtomIndices = False
	opts.addStereoAnnotation = False
	img_ori = Image.open(image_path).convert('RGB')
	img_ori_1k = img_ori.resize((1000,1000))
	if other2ppsocr:
	img_rebuit = Draw.MolToImage(ocr_mol, options=opts,size=(1000, 1000))
	else:
	img_rebuit = Draw.MolToImage(ocr_mol, options=opts,size=(1000, 1000))
	combined_img = Image.new('RGB', (img_ori_1k.width + boxed_img.width + img_rebuit.width, img_ori_1k.height))
	combined_img.paste(img_ori_1k, (0, 0))
	combined_img.paste(boxed_img, (img_ori_1k.width, 0))
	combined_img.paste(img_rebuit, (img_ori_1k.width + boxed_img.width, 0))
	imprefix=os.path.basename(image_path).split('.')[0]
	combined_img.save(f"{ima_checkdir}/{imprefix}Boxed.png")

	new_row = {'file_name':image_path, "SMILESori":SMILESori,
	'SMILESpre':SMILESpre,
	'SMILESexp':expandStero_smi,
	}
	smiles_data = smiles_data._append(new_row, ignore_index=True)

	#accu similarity calculation
	if getacc:
	sameWithOutStero=comparing_smiles(new_row,SMILESpre)#try to ingnore cis chiral, as 2d coords including all the infos
	sameWithOutStero_exp=comparing_smiles(new_row,expandStero_smi)#this ignore chairity and number be NOTE

	if (type(SMILESori)!=type('a')) or (type(SMILESpre)!=type('a')):
	if sameWithOutStero or sameWithOutStero_exp:
	mysum += 1
	else:
	print(f"smiles problems\n{SMILESori}\n{SMILESpre}\n{image_path}")
	failed.append([SMILESori,SMILESpre,image_path])
	mydiff.append([SMILESori,SMILESpre,image_path])
	continue
	mol1 = Chem.MolFromSmiles(SMILESori)#TODO considering smiles with rdkit not recongized in real data
	if mol1 is None:
	rd_smi_ori, success1_=rdkit_canonicalize_smiles(SMILESori)
	mol1=Chem.MolFromSmiles(rd_smi_ori)
	if (mol_rebuit is None) or (mol1 is None):
	if sameWithOutStero or sameWithOutStero_exp:
	mysum += 1
	else:
	print(f'get rdkit mol None\n{SMILESori}\n{SMILESpre}\n{image_path}')
	failed.append([SMILESori,SMILESpre,image_path])
	mydiff.append([SMILESori,SMILESpre,image_path])
	continue
	if mol1:
	rdk_smi1=Chem.MolToSmiles(mol1)
	else:
	rdk_smi1=SMILESori
	if mol_rebuit:
	rdk_smi2=Chem.MolToSmiles(mol_rebuit)
	else:
	rdk_smi2=''
	# if rdk_smi1==rdk_smi2 or rdk_smi1==expandStero_smi or sameWithOutStero:#also considering the abbre in Ori
	if rdk_smi1==rdk_smi2 or rdk_smi1==expandStero_smi:
	mysum += 1
	else:
	if sameWithOutStero or sameWithOutStero_exp:
	mysum += 1
	else:
	mydiff.append([SMILESori,SMILESpre,image_path])
	if visual_check:
	combined_img.save(f"{ima_checkdir}/{imprefix}Boxed_diff{len(mydiff)}.png")
	try:
	morganfps1 = AllChem.GetMorganFingerprint(mol1, 3,useChirality=True)
	morganfps2 = AllChem.GetMorganFingerprint(mol_rebuit, 3,useChirality=True)
	morgan_tani = DataStructs.DiceSimilarity(morganfps1, morganfps2)
	fp1 = Chem.RDKFingerprint(mol1)
	fp2 = Chem.RDKFingerprint(mol_rebuit)
	tanimoto = DataStructs.FingerprintSimilarity(fp1, fp2)
	if expandStero_smi!= '':
	fp3 = Chem.RDKFingerprint(molexp)
	morganfps3 = AllChem.GetMorganFingerprint(molexp, 3,useChirality=True)
	morgan_tani3 = DataStructs.DiceSimilarity(morganfps1, morganfps3)
	tanimoto3 = DataStructs.FingerprintSimilarity(fp1, fp3)
	if morgan_tani3> morgan_tani or tanimoto3> tanimoto :
	sim+=morgan_tani3
	simRD+=tanimoto3
	else:
	simRD+=tanimoto
	sim+=morgan_tani
	except Exception as e:
	print(f"mol to fingerprint erros")
	simRD+=0
	sim+=0
	continue
	except Exception as e:
	print(f"file_name@: {image_path}\n SMILES in csv:\n{SMILESori}")
	raise Exception("@debug this!!\n")

	if getacc:
	sim_100 = 100*sim/len(smiles_data)
	simrd100 = 100*simRD/len(smiles_data)
	flogout.write(f"rdkit concanlized==smiles:{100*mysum/len(smiles_data)}%\n")
	flogout.write(f"failed:{len(failed)}\n totoal saved in csv : {len(smiles_data)}\n")
	flogout.write(f"avarage similarity morgan tanimoto: RDKFp tanimoto:: {sim_100}%, {simrd100}% \n")#morgan_tani considering chiraty
	flogout.write(f'I2M@@:: match--{mysum},unmatch--{len(mydiff)},failed--{len(failed)},correct %{100*mysum/len(smiles_data)} \n')
	#molscribe evalutate
	from src.solver.evaluate import SmilesEvaluator
	evaluator = SmilesEvaluator(smiles_data['SMILESori'], tanimoto=False)
	res_pre=evaluator.evaluate(smiles_data['SMILESpre'])
	res_exp=evaluator.evaluate(smiles_data['SMILESexp'])
	flogout.write(f'MolScribe style evaluation@SMILESpre:: {str(res_pre)} \n')
	flogout.write(f'MolScribe style evaluation@SMILESexp:: {str(res_exp)} \n')
	flogout.close()
	print(f"will save {len(smiles_data)} dataframe into csv")
	smiles_data.to_csv(outcsv_filename, index=False)


	import torch.nn as nn
	import torch.nn.functional as F
	import torchvision


	class RTDETRPostProcessor(nn.Module):
	__share__ = ['num_classes', 'use_focal_loss', 'num_top_queries', 'remap_mscoco_category']

	def __init__(self, classes_dict=None, use_focal_loss=True, num_top_queries=300, remap_mscoco_category=False) -> None:
	super().__init__()
	self.use_focal_loss = use_focal_loss
	if classes_dict is None:
	classes_dict = {0:'other',1:'C',2:'O',3:'N',4:'Cl',5:'Br',6:'S',7:'F',8:'B',
	9:'I',10:'P',11:'H',12:'Si',
	#bond
	13:'single',14:'wdge',15:'dash',
	16:'=',17:'#',18:':',#aromatic
	#charge
	19:'-4',20:'-2',
	21:'-1',#-
	22:'+1',#+
	23:'+2',
	}
	num_classes=len(classes_dict)
	self.num_top_queries = num_top_queries
	self.num_classes = num_classes
	self.remap_mscoco_category = remap_mscoco_category
	self.deploy_mode = False

	mscoco_category2label = {k: i for i, k in enumerate(classes_dict.keys())}
	mscoco_label2category = {v: k for k, v in mscoco_category2label.items()}
	self.mscoco_label2category=mscoco_label2category

	def extra_repr(self) -> str:
	return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}'

	# def forward(self, outputs, orig_target_sizes):
	def forward(self, outputs, orig_target_sizes):

	logits, boxes = outputs['pred_logits'], outputs['pred_boxes']
	# orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)

	bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy')
	bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1)

	if self.use_focal_loss:
	scores = F.sigmoid(logits)
	scores, index = torch.topk(scores.flatten(1), self.num_top_queries, axis=-1)
	labels = index % self.num_classes
	index = index // self.num_classes
	boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]))

	else:
	scores = F.softmax(logits)[:, :, :-1]
	scores, labels = scores.max(dim=-1)
	boxes = bbox_pred
	if scores.shape[1] > self.num_top_queries:
	scores, index = torch.topk(scores, self.num_top_queries, dim=-1)
	labels = torch.gather(labels, dim=1, index=index)
	boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))

	# TODO for onnx export
	if self.deploy_mode:
	return labels, boxes, scores

	# TODO
	if self.remap_mscoco_category:
	# from ...data.coco import mscoco_label2category
	labels = torch.tensor([self.mscoco_label2category[int(x.item())] for x in labels.flatten()])\
	.to(boxes.device).reshape(labels.shape)

	results = []
	for lab, box, sco in zip(labels, boxes, scores):
	result = dict(labels=lab, boxes=box, scores=sco)
	results.append(result)

	return results


	def deploy(self, ):
	self.eval()
	self.deploy_mode = True
	return self

	@property
	def iou_types(self, ):
	return ('bbox', )