import json import logging import re from bs4 import BeautifulSoup from collections import defaultdict from typing import Dict, List # TODO: review the functions here def extract_all_tagged_phrases(text: str) -> Dict[str, List[str]]: soup = BeautifulSoup(text, "html.parser") tagged_phrases = defaultdict(list) for tag in soup.find_all(True): if tag.name: # Clean and process the text full_text = " ".join(tag.stripped_strings) full_text = re.sub(r"\s+", " ", full_text.strip()) full_text = re.sub(r'(? dict: pred = {} if kind == "json": json_match = re.search(r"\{[\s\S]+\}", prediction) if json_match: json_str = json_match.group(0) json_str = re.sub(r"(\w+)-\$?\\?(\w+)\$?", r"\1-\2", json_str) json_str = json_str.replace('\\"', '"') json_str = re.sub(r'}\s*"', '}, "', json_str) json_str = re.sub(r']\s*"', '], "', json_str) try: pred = json.loads(json_str) except json.JSONDecodeError as e: logging.warning(f"Failed to parse JSON: {json_str}") logging.warning(f"Error: {str(e)}") try: json_str = re.sub(r",\s*([}\]])", r"\1", json_str) json_str = re.sub(r"(?