From 4b26fc48948b633dba07c0d26773756140722da5 Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 14 Feb 2026 22:23:23 +0300 Subject: [PATCH 1/8] New message entities parsers --- telebot/formatting.py | 687 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 681 insertions(+), 6 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 0e300f7c1..2f064062e 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -1,7 +1,5 @@ """ Markdown & HTML formatting functions. - -.. versionadded:: 4.5.1 """ import re @@ -9,6 +7,15 @@ from typing import Optional, List, Dict +# Alternative message entities parsers. Can be: +# "deepseek" - deepseek version +# "gemini" - gemini version +# "chatgpt" - chatgpt version +# "coder" - @coder2020official version +# other values - original version +ENTITY_PARSER_MODE = None + + def format_text(*args, separator="\n"): """ Formats a list of strings into a single string. @@ -45,6 +52,7 @@ def escape_html(content: str) -> str: return html.escape(content) +# noinspection RegExpRedundantEscape def escape_markdown(content: str) -> str: """ Escapes Markdown characters in a string of Markdown. @@ -269,6 +277,9 @@ def mcode(content: str, language: str="", escape: Optional[bool]=True) -> str: :param content: The string to code. :type content: :obj:`str` + :param language: The programming language of the code. Defaults to an empty string. + :type language: :obj:`str` + :param escape: True if you need to escape special characters. Defaults to True. :type escape: :obj:`bool` @@ -304,6 +315,9 @@ def hpre(content: str, escape: Optional[bool]=True, language: str="") -> str: :param escape: True if you need to escape special characters. Defaults to True. :type escape: :obj:`bool` + :param language: The programming language of the code. Defaults to an empty string. + :type language: :obj:`str` + :return: The formatted string. :rtype: :obj:`str` """ @@ -392,6 +406,14 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option ) >> "Test parse formatting, url and text_mention and mention @username" """ + if ENTITY_PARSER_MODE == "deepseek": + return apply_html_entities_ds(text, entities, custom_subs) + elif ENTITY_PARSER_MODE == "gemini": + return apply_html_entities_gm(text, entities, custom_subs) + elif ENTITY_PARSER_MODE == "chatgpt": + return apply_html_entities_cg(text, entities, custom_subs) + elif ENTITY_PARSER_MODE == "coder": + return apply_html_entities_coder(text, entities, custom_subs) if not entities: return text.replace("&", "&").replace("<", "<").replace(">", ">") @@ -401,7 +423,6 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option "italic": "{text}", "pre": "
{text}
", "code": "{text}", - # "url": "{text}", # @badiboy plain URLs have no text and do not need tags "text_link": "{text}", "strikethrough": "{text}", "underline": "{text}", @@ -409,7 +430,6 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option "custom_emoji": "{text}", "blockquote": "
{text}
", "expandable_blockquote": "
{text}
", - } if custom_subs: @@ -423,8 +443,8 @@ def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, l if subst_type == "text_mention": subst_type = "text_link" url = "tg://user?id={0}".format(user.id) - elif subst_type == "mention": - url = "https://t.me/{0}".format(upd_text[1:]) + # elif subst_type == "mention": + # url = "https://t.me/{0}".format(upd_text[1:]) upd_text = upd_text.replace("&", "&").replace("<", "<").replace(">", ">") if not subst_type or not _subs.get(subst_type): return upd_text @@ -477,3 +497,658 @@ def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, l html_text += func(utf16_text[offset * 2:]) return html_text + + +#region DeepSeek vibecoding here +class EntityProcessor: + """ + Handles parsing of text with message entities to HTML. + """ + + # Entity type to HTML template mapping + ENTITY_TEMPLATES = { + "bold": "{text}", + "italic": "{text}", + "pre": "
{text}
", + "code": "{text}", + "text_link": "{text}", + "strikethrough": "{text}", + "underline": "{text}", + "spoiler": "{text}", + "custom_emoji": "{text}", + "blockquote": "
{text}
", + "expandable_blockquote": "
{text}
", + } + + def __init__(self, text: str, custom_subs: Optional[Dict[str, str]] = None): + self.text = text + self.utf16_mapping = self.utf16_code_units_to_indices(text) + self.total_utf16_units = len(self.utf16_mapping) + self.custom_subs = custom_subs + + def check_entity_exists(self, entity_type: str) -> bool: + """ + Check if an entity type has a defined HTML template, considering custom substitutions. + """ + return (entity_type in self.ENTITY_TEMPLATES) or (self.custom_subs and (entity_type in self.custom_subs)) + + def get_entity_template(self, entity_type: str, default: Optional[str] = None) -> Optional[str]: + """ + Get the HTML template for a given entity type, considering custom substitutions. + """ + if entity_type in self.ENTITY_TEMPLATES: + return self.ENTITY_TEMPLATES[entity_type] + elif self.custom_subs and (entity_type in self.custom_subs): + return self.custom_subs[entity_type] + else: + return default + + @staticmethod + def utf16_code_units_to_indices(text: str) -> List[int]: + """ + Convert UTF-16 code unit positions to Python string indices. + + Returns: + code_unit_to_char_idx: Mapping from UTF-16 code unit position to character index + """ + code_unit_to_char_idx = [] + + code_unit_pos = 0 + for char_idx, char in enumerate(text): + code_point = ord(char) + # Characters outside BMP (U+10000 to U+10FFFF) use 2 UTF-16 code units + if code_point >= 0x10000: + code_units = 2 + else: + code_units = 1 + + # Map this code unit position to character index + for _ in range(code_units): + code_unit_to_char_idx.append(char_idx) + + code_unit_pos += code_units + + return code_unit_to_char_idx + + def utf16_to_char_index(self, utf16_pos: int) -> int: + """ + Convert UTF-16 code unit position to character index. + """ + if utf16_pos >= len(self.utf16_mapping): + return len(self.text) + return self.utf16_mapping[utf16_pos] + + def get_entity_text(self, entity) -> str: # entity: MessageEntity + """ + Extract the text for an entity using UTF-16 code unit offsets. + """ + start_char = self.utf16_to_char_index(entity.offset) + end_char = self.utf16_to_char_index(entity.offset + entity.length) + return self.text[start_char:end_char] + + def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity + """ + Create HTML tag for an entity with the given content. + """ + entity_type = entity.type + + template = self.get_entity_template(entity_type) + if not template: + return content + + # Prepare format arguments + format_args = {"text": content} + if entity_type == "text_mention": + template = self.get_entity_template("text_link") + format_args["url"] = "tg://user?id={0}".format(entity.user.id) + elif entity_type == "text_link": + format_args["url"] = escape_html(entity.url or "") + elif entity_type == "custom_emoji": + format_args["custom_emoji_id"] = entity.custom_emoji_id or "" + elif entity_type == "pre" and entity.language: + format_args["text"] = '{}'.format(entity.language, format_args["text"]) + + return template.format(**format_args) + +def apply_html_entities_ds(text: str, entities: Optional[List], # entities: Optional[List[MessageEntity]] + custom_subs: Optional[Dict[str, str]] = None) -> str: + """ + Parse text message to HTML code according to message entities. + Properly handles UTF-16 code units for offsets and nested entities. + + Args: + text: Plain text message + entities: List of MessageEntity objects + custom_subs: Optional mapping of entity types to custom HTML substitutions/templates. + + Returns: + HTML formatted string + """ + if not text: + return text + elif not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + processor = EntityProcessor(text, custom_subs=custom_subs) + + # Sort entities by their position in the text + # For proper nesting handling, we need to process from the end + sorted_entities = sorted(entities, key=lambda e: e.offset, reverse=True) + + # Build a tree structure of entities + # First, convert UTF-16 offsets to character indices for easier processing + entity_ranges = [] + for entity in sorted_entities: + if not processor.check_entity_exists(entity.type): + continue + + start_char = processor.utf16_to_char_index(entity.offset) + end_char = processor.utf16_to_char_index(entity.offset + entity.length) + + entity_ranges.append({ + 'entity': entity, + 'start': start_char, + 'end': end_char, + 'type': entity.type, + }) + + # Sort by start position (ascending) and then by length (descending) + # This ensures parent entities come before children + entity_ranges.sort(key=lambda x: (x['start'], -x['end'])) + + # Build the HTML recursively + def process_range(start_idx: int, end_idx: int, entities_in_range: List[dict]) -> str: + """ + Recursively process a text range with its entities. + """ + if not entities_in_range: + return text[start_idx:end_idx] + + # Group entities by their start position + result_parts = [] + current_pos = start_idx + + # Sort entities by their start position + entities_in_range.sort(key=lambda x: x['start']) + + i = 0 + while i < len(entities_in_range): + cur_entity = entities_in_range[i] + + # Add text before this entity + if cur_entity['start'] > current_pos: + result_parts.append(text[current_pos:cur_entity['start']]) + + # Find all entities that start at the same position or are nested within + nested_entities = [] + j = i + while j < len(entities_in_range) and entities_in_range[j]['start'] < cur_entity['end']: + if entities_in_range[j]['start'] >= cur_entity['start']: + nested_entities.append(entities_in_range[j]) + j += 1 + + # Filter entities that are actually within this entity's range + nested_entities = [e for e in nested_entities if + e['start'] >= cur_entity['start'] and e['end'] <= cur_entity['end']] + + # Process the content of this entity (including nested entities) + content = process_range(cur_entity['start'], cur_entity['end'], + [e for e in nested_entities if e != cur_entity]) + + # Apply this entity's HTML tag + html_content = processor.create_html_tag(cur_entity['entity'], content) + result_parts.append(html_content) + + # Move current position to the end of this entity + current_pos = cur_entity['end'] + i = j + + # Add remaining text + if current_pos < end_idx: + result_parts.append(text[current_pos:end_idx]) + + return ''.join(result_parts) + + # Process the entire text + return process_range(0, len(text), entity_ranges) +#endregion + +#region Gemini vibecoding here +def apply_html_entities_gm( + text: str, + entities: Optional[List], # entities: Optional[List[MessageEntity]] + custom_subs: Optional[Dict[str, str]] = None +) -> str: + # if not entities: + # return html.escape(text) + if not text: + return text + elif not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + # --- Step 1: Map UTF-16 offsets to Python String Indices --- + # Telegram API uses UTF-16 code units for offsets/length. + # Python strings are indexed by Unicode code points. + # We need to map: utf16_offset -> python_string_index + + # Identify all 'significant' UTF-16 boundaries we care about (start and end of every entity) + boundaries = set() + for e in entities: + boundaries.add(e.offset) + boundaries.add(e.offset + e.length) + + # Sort them to iterate through the text linearly + sorted_boundaries = sorted(list(boundaries)) + boundary_map = {} # Maps utf16_offset -> python_index + + current_utf16_len = 0 + boundary_idx = 0 + + # Iterate over the string code point by code point + for py_index, char in enumerate(text): + # If we reached a boundary, record the mapping + while boundary_idx < len(sorted_boundaries) and current_utf16_len == sorted_boundaries[boundary_idx]: + boundary_map[sorted_boundaries[boundary_idx]] = py_index + boundary_idx += 1 + + if boundary_idx >= len(sorted_boundaries): + break + + # Advance UTF-16 counter + # BMP characters (<= 0xFFFF) take 1 unit. Non-BMP (surrogates) take 2 units. + if ord(char) > 0xFFFF: + current_utf16_len += 2 + else: + current_utf16_len += 1 + + # Handle boundaries that fall exactly at the end of the string + while boundary_idx < len(sorted_boundaries) and current_utf16_len == sorted_boundaries[boundary_idx]: + boundary_map[sorted_boundaries[boundary_idx]] = len(text) + boundary_idx += 1 + + # --- Step 2: Create Markers --- + # We transform entities into "Insert Start Tag" and "Insert End Tag" markers. + markers = [] + + for e in entities: + if e.offset not in boundary_map or (e.offset + e.length) not in boundary_map: + continue # Skip invalid entities + + start_py = boundary_map[e.offset] + end_py = boundary_map[e.offset + e.length] + + # Structure: (Index, Type, Priority, Entity) + # Type: 1 = Start Tag, 0 = End Tag. + # Priority: Used to ensure correct nesting (Outer tags wrap Inner tags). + # - For Start Tags (1): Larger length = Higher priority (Process earlier). + # We use negative length so 'smaller' number comes first in ASC sort. + # - For End Tags (0): Smaller length = Higher priority (Process earlier). + + # Start Marker + markers.append((start_py, 1, -e.length, e)) + + # End Marker + markers.append((end_py, 0, e.length, e)) + + # --- Step 3: Sort Markers --- + # Primary Key: Index (asc) + # Secondary Key: Type (End tags (0) before Start tags (1) at same index) -> This fixes vs + # Tertiary Key: Priority (Length based nesting) + + # FIX: We use a lambda key to avoid comparing the 'e' (MessageEntity) object directly + markers.sort(key=lambda x: (x[0], x[1], x[2])) + + # --- Step 4: Build HTML --- + result = [] + text_ptr = 0 + stack = [] # To track currently open entities + + for index, tag_type, _, entity in markers: + # 1. Append text leading up to this marker + if index > text_ptr: + result.append(html.escape(text[text_ptr:index])) + text_ptr = index + + # 2. Get the HTML tag representation + tag = get_html_tag(entity, custom_subs) + if not tag: + continue + + if tag_type == 1: # START TAG + result.append(tag['open']) + stack.append(entity) + + else: # END TAG + # If stack is empty (shouldn't happen in valid data), ignore + if not stack: + continue + + # If the entity to close is at the top of the stack, close it normally + if stack[-1] == entity: + result.append(tag['close']) + stack.pop() + else: + # INTERSECTING ENTITIES DETECTED + # We need to close everything down to our entity, then reopen them + if entity in stack: + temp_stack = [] + + # Pop and close until we find the target + while stack[-1] != entity: + top_entity = stack.pop() + top_tag = get_html_tag(top_entity, custom_subs) + if top_tag: + result.append(top_tag['close']) + temp_stack.append(top_entity) + + # Close the target entity + result.append(tag['close']) + stack.pop() + + # Re-open the temporarily closed entities (in reverse order to preserve nesting) + for popped_entity in reversed(temp_stack): + p_tag = get_html_tag(popped_entity, custom_subs) + if p_tag: + result.append(p_tag['open']) + stack.append(popped_entity) + + # Append remaining text + if text_ptr < len(text): + result.append(html.escape(text[text_ptr:])) + + return "".join(result) + + +def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: # entity: MessageEntity + """Helper to get open/close tags based on entity type.""" + + # Check custom subs first (basic implementation: if type in dict, return it as open tag) + # Note: The prompt implies full substitutions, but simple key-value usually implies 'open' tag or full format. + # Given the complexity of closing tags, we stick to the Prompt's Rules for known types. + + t = entity.type + if t == "bold": + return {'open': "", 'close': ""} + elif t == "italic": + return {'open': "", 'close': ""} + elif t == "underline": + return {'open': "", 'close': ""} + elif t == "strikethrough": + return {'open': "", 'close': ""} + elif t == "spoiler": + return {'open': '', 'close': ""} + elif t == "code": + return {'open': "", 'close': ""} + elif (t == "pre") and entity.language: + return {'open': f'
', 'close': "
"} + elif t == "pre": + return {'open': "
", 'close': "
"} + elif t == "blockquote": + return {'open': "
", 'close': "
"} + elif t == "expandable_blockquote": + return {'open': "
", 'close': "
"} + elif t == "text_link": + return {'open': f'', 'close': ""} + elif t == "text_mention": + return {'open': f'', 'close': ""} + elif t == "custom_emoji": + return {'open': f'', 'close': ""} + elif custom_subs and (t in custom_subs): + # Support custom substitutions by splitting the template around the {text} placeholder + template = custom_subs[t] + if "{text}" in template: + open_part, close_part = template.split("{text}", 1) + else: + # If no {text} placeholder is present, treat the entire template as the opening part + open_part, close_part = template, "" + return {'open': open_part, 'close': close_part} + + return None +#endregion + +#region ChatGPT vibecoding here +ENTITY_TEMPLATES_CG = { + "bold": "{text}", + "italic": "{text}", + "pre": "
{text}
", + "code": "{text}", + "text_link": "{text}", + "strikethrough": "{text}", + "underline": "{text}", + "spoiler": "{text}", + "custom_emoji": "{text}", + "blockquote": "
{text}
", + "expandable_blockquote": "
{text}
", +} + +def utf16_index_map(s: str) -> List[int]: + """ + Map UTF-16 code unit index -> Python string index. + Result length = utf16_len + 1 + """ + mapping = [0] + u16 = 0 + for i, ch in enumerate(s): + code = ord(ch) + u16 += 2 if code > 0xFFFF else 1 + while len(mapping) <= u16: + mapping.append(i + 1) + return mapping + +def apply_template(entity, inner: str, custom_subs: Optional[Dict[str, str]]) -> str: + t = entity.type + if t in ENTITY_TEMPLATES_CG: + tpl = ENTITY_TEMPLATES_CG[t] + elif custom_subs and t in custom_subs: + tpl = custom_subs[t] + else: + return inner + + data = {"text": inner} + + if t == "text_link": + data["url"] = getattr(entity, "url", "") + elif t == "text_mention": + data["url"] = f"tg://user?id={getattr(entity, 'user', {}).id if getattr(entity, 'user', None) else ''}" + elif t == "custom_emoji": + data["custom_emoji_id"] = getattr(entity, "custom_emoji_id", "") + elif (t == "pre") and getattr(entity, "language", None): + data["text"] = f'{inner}' + + return tpl.format(**data) + +def build_tree(entities: List, mapping: List[int]): + nodes = [] + + for e in entities: + start16 = e.offset + end16 = e.offset + e.length + + start = mapping[start16] + end = mapping[end16] + + nodes.append({ + "entity": e, + "start": start, + "end": end, + "children": [] + }) + + nodes.sort(key=lambda node: (node["start"], -node["end"])) + + stack = [] + roots = [] + + for n in nodes: + while stack and n["start"] >= stack[-1]["end"]: + stack.pop() + + if stack: + stack[-1]["children"].append(n) + else: + roots.append(n) + + stack.append(n) + + return roots + +def render(text: str, nodes, custom_subs): + result = [] + pos = 0 + + for n in nodes: + result.append(text[pos:n["start"]]) + + inner = render( + text[n["start"]:n["end"]], + shift_nodes(n["children"], n["start"]), + custom_subs + ) + + wrapped = apply_template(n["entity"], inner, custom_subs) + result.append(wrapped) + + pos = n["end"] + + result.append(text[pos:]) + return "".join(result) + +def shift_nodes(nodes, shift): + out = [] + for n in nodes: + out.append({ + "entity": n["entity"], + "start": n["start"] - shift, + "end": n["end"] - shift, + "children": shift_nodes(n["children"], shift), + }) + return out + +def apply_html_entities_cg( + text: str, + entities: Optional[List], + custom_subs: Optional[Dict[str, str]] +) -> str: + if not text: + return text + elif not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + mapping = utf16_index_map(text) + tree = build_tree(entities, mapping) + return render(text, tree, custom_subs) +#endregion + +def apply_html_entities_coder(text: str, entities=None, custom_subs=None) -> str: + """ + Apply HTML formatting to text based on provided entities. + Handles nested and overlapping entities correctly. + """ + if not entities: + return text.replace("&", "&").replace("<", "<").replace(">", ">") + + _subs_c = { + "bold": "{text}", + "italic": "{text}", + "pre": "
{text}
", + "code": "{text}", + "text_link": "{text}", + "strikethrough": "{text}", + "underline": "{text}", + "spoiler": "{text}", + "custom_emoji": "{text}", + "blockquote": "
{text}
", + "expandable_blockquote": "
{text}
", + } + + if custom_subs: + for key, value in custom_subs.items(): + _subs_c[key] = value + + # Sort entities by offset (starting position), with longer entities first for equal offsets + sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length)) + + # Convert text to utf-16 encoding for proper handling + utf16_text = text.encode("utf-16-le") + + def format_entity(entity, content): + """Apply entity formatting to the content""" + entity_type = entity.type + + # Handle different entity types + if entity_type == "text_mention" and hasattr(entity, 'user'): + return f"{content}" + # elif entity_type == "mention": # No need to do this, @username works fine + # username = content[1:] # Remove @ symbol + # return f"{content}" + elif entity_type == "text_link" and hasattr(entity, 'url'): + return f"{content}" + elif entity_type == "custom_emoji" and hasattr(entity, 'custom_emoji_id'): + return f"{content}" + elif entity_type == "pre" and hasattr(entity, 'language') and entity.language: + return f"
{content}
" + elif entity_type in _subs_c: + template = _subs_c[entity_type] + return template.format(text=content) + + # If no matching entity type, return text as is + return content + + def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): + if end_pos is None: + end_pos = len(byte_text) + + if not entity_list or start_pos >= end_pos: + return escape_html(byte_text[start_pos:end_pos]) + + current_entity = entity_list[0] + current_start = current_entity.offset * 2 + current_end = current_start + current_entity.length * 2 + + if current_end <= start_pos or current_start >= end_pos: + return escape_html(byte_text[start_pos:end_pos]) + + result = [] + + if current_start > start_pos: + result.append(escape_html(byte_text[start_pos:current_start])) + + nested_entities = [] + remaining_entities = [] + + for entity in entity_list[1:]: + entity_start = entity.offset * 2 + # entity_end = entity_start + entity.length * 2 + + if current_start <= entity_start < current_end: + nested_entities.append(entity) + else: + remaining_entities.append(entity) + + if nested_entities: + inner_content = process_entities( + byte_text, + nested_entities, + current_start, + current_end + ) + else: + inner_content = escape_html(byte_text[current_start:current_end]) + + result.append(format_entity(current_entity, inner_content)) + + if current_end < end_pos and remaining_entities: + result.append(process_entities( + byte_text, + remaining_entities, + current_end, + end_pos + )) + elif current_end < end_pos: + result.append(escape_html(byte_text[current_end:end_pos])) + + return "".join(result) + + html_result = process_entities(utf16_text, sorted_entities) + + return html_result From cd54f2e1e56b500a7d0a75d247face39ce69fcda Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 14 Feb 2026 22:33:27 +0300 Subject: [PATCH 2/8] Fix escape_html to escape_entity --- telebot/formatting.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 2f064062e..14c00c3c9 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -1071,6 +1071,12 @@ def apply_html_entities_coder(text: str, entities=None, custom_subs=None) -> str # Convert text to utf-16 encoding for proper handling utf16_text = text.encode("utf-16-le") + def escape_entity(text_part): + """Escape HTML special characters in a text part""" + if isinstance(text_part, bytes): + text_part = text_part.decode("utf-16-le") + return text_part.replace("&", "&").replace("<", "<").replace(">", ">") + def format_entity(entity, content): """Apply entity formatting to the content""" entity_type = entity.type @@ -1099,19 +1105,19 @@ def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): end_pos = len(byte_text) if not entity_list or start_pos >= end_pos: - return escape_html(byte_text[start_pos:end_pos]) + return escape_entity(byte_text[start_pos:end_pos]) current_entity = entity_list[0] current_start = current_entity.offset * 2 current_end = current_start + current_entity.length * 2 if current_end <= start_pos or current_start >= end_pos: - return escape_html(byte_text[start_pos:end_pos]) + return escape_entity(byte_text[start_pos:end_pos]) result = [] if current_start > start_pos: - result.append(escape_html(byte_text[start_pos:current_start])) + result.append(escape_entity(byte_text[start_pos:current_start])) nested_entities = [] remaining_entities = [] @@ -1133,7 +1139,7 @@ def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): current_end ) else: - inner_content = escape_html(byte_text[current_start:current_end]) + inner_content = escape_entity(byte_text[current_start:current_end]) result.append(format_entity(current_entity, inner_content)) @@ -1145,7 +1151,7 @@ def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): end_pos )) elif current_end < end_pos: - result.append(escape_html(byte_text[current_end:end_pos])) + result.append(escape_entity(byte_text[current_end:end_pos])) return "".join(result) From 2cebc8eb732e34b49bd95b23eef3cceae779689f Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 21 Feb 2026 12:30:59 +0300 Subject: [PATCH 3/8] Apply new entities processing --- telebot/formatting.py | 744 ++++++------------------------------------ 1 file changed, 95 insertions(+), 649 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 14c00c3c9..95e81db0f 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -7,15 +7,6 @@ from typing import Optional, List, Dict -# Alternative message entities parsers. Can be: -# "deepseek" - deepseek version -# "gemini" - gemini version -# "chatgpt" - chatgpt version -# "coder" - @coder2020official version -# other values - original version -ENTITY_PARSER_MODE = None - - def format_text(*args, separator="\n"): """ Formats a list of strings into a single string. @@ -382,43 +373,15 @@ def hcite(content: str, escape: Optional[bool] = True, expandable: Optional[bool ) -def apply_html_entities(text: str, entities: Optional[List], custom_subs: Optional[Dict[str, str]]) -> str: +def apply_html_entities(text: str, entities=None, custom_subs=None) -> str: """ - Author: @sviat9440 - Updaters: @badiboy, @EgorKhabarov - Message: "*Test* parse _formatting_, [url](https://example.com), [text_mention](tg://user?id=123456) and mention @username" - - .. code-block:: python3 - :caption: Example: - - apply_html_entities(text, entities) - >> "Test parse formatting, url, text_mention and mention @username" - - Custom subs: - You can customize the substitutes. By default, there is no substitute for the entities: hashtag, bot_command, email. You can add or modify substitute an existing entity. - .. code-block:: python3 - :caption: Example: - - apply_html_entities( - text, - entities, - {"bold": "{text}", "italic": "{text}", "mention": "{text}"}, - ) - >> "Test parse formatting, url and text_mention and mention @username" + Apply HTML formatting to text based on provided entities. + Handles nested and overlapping entities correctly. """ - if ENTITY_PARSER_MODE == "deepseek": - return apply_html_entities_ds(text, entities, custom_subs) - elif ENTITY_PARSER_MODE == "gemini": - return apply_html_entities_gm(text, entities, custom_subs) - elif ENTITY_PARSER_MODE == "chatgpt": - return apply_html_entities_cg(text, entities, custom_subs) - elif ENTITY_PARSER_MODE == "coder": - return apply_html_entities_coder(text, entities, custom_subs) - if not entities: return text.replace("&", "&").replace("<", "<").replace(">", ">") - _subs = { + _subs_c = { "bold": "{text}", "italic": "{text}", "pre": "
{text}
", @@ -434,69 +397,101 @@ def apply_html_entities(text: str, entities: Optional[List], custom_subs: Option if custom_subs: for key, value in custom_subs.items(): - _subs[key] = value + _subs_c[key] = value + + # Sort entities by offset (starting position), with longer entities first for equal offsets + sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length)) + + # Convert text to utf-16 encoding for proper handling utf16_text = text.encode("utf-16-le") - html_text = "" - - def func(upd_text, subst_type=None, url=None, user=None, custom_emoji_id=None, language=None): - upd_text = upd_text.decode("utf-16-le") - if subst_type == "text_mention": - subst_type = "text_link" - url = "tg://user?id={0}".format(user.id) - # elif subst_type == "mention": - # url = "https://t.me/{0}".format(upd_text[1:]) - upd_text = upd_text.replace("&", "&").replace("<", "<").replace(">", ">") - if not subst_type or not _subs.get(subst_type): - return upd_text - subs = _subs.get(subst_type) - if subst_type == "custom_emoji": - return subs.format(text=upd_text, custom_emoji_id=custom_emoji_id) - elif (subst_type == "pre") and language: - return "
{1}
".format(language, upd_text) - return subs.format(text=upd_text, url=url) - - offset = 0 - start_index = 0 - end_index = 0 - for entity in entities: - if entity.offset > offset: - # when the offset is not 0: for example, a __b__ - # we need to add the text before the entity to the html_text - html_text += func(utf16_text[offset * 2: entity.offset * 2]) - offset = entity.offset - - new_string = func(utf16_text[offset * 2: (offset + entity.length) * 2], subst_type=entity.type, - url=entity.url, user=entity.user, custom_emoji_id=entity.custom_emoji_id, - language=entity.language) - start_index = len(html_text) - html_text += new_string - offset += entity.length - end_index = len(html_text) - elif entity.offset == offset: - new_string = func(utf16_text[offset * 2: (offset + entity.length) * 2], subst_type=entity.type, - url=entity.url, user=entity.user, custom_emoji_id=entity.custom_emoji_id, - language=entity.language) - start_index = len(html_text) - html_text += new_string - end_index = len(html_text) - offset += entity.length + + def escape_entity(text_part): + """Escape HTML special characters in a text part""" + if isinstance(text_part, bytes): + text_part = text_part.decode("utf-16-le") + return text_part.replace("&", "&").replace("<", "<").replace(">", ">") + + def format_entity(entity, content): + """Apply entity formatting to the content""" + entity_type = entity.type + + # Handle different entity types + if entity_type == "text_mention" and hasattr(entity, 'user'): + return f"{content}" + # elif entity_type == "mention": # No need to do this, @username works fine + # username = content[1:] # Remove @ symbol + # return f"{content}" + elif entity_type == "text_link" and hasattr(entity, 'url'): + return f"{content}" + elif entity_type == "custom_emoji" and hasattr(entity, 'custom_emoji_id'): + return f"{content}" + elif entity_type == "pre" and hasattr(entity, 'language') and entity.language: + return f"
{content}
" + elif entity_type in _subs_c: + template = _subs_c[entity_type] + return template.format(text=content) + + # If no matching entity type, return text as is + return content + + def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): + if end_pos is None: + end_pos = len(byte_text) + + if not entity_list or start_pos >= end_pos: + return escape_entity(byte_text[start_pos:end_pos]) + + current_entity = entity_list[0] + current_start = current_entity.offset * 2 + current_end = current_start + current_entity.length * 2 + + if current_end <= start_pos or current_start >= end_pos: + return escape_entity(byte_text[start_pos:end_pos]) + + result = [] + + if current_start > start_pos: + result.append(escape_entity(byte_text[start_pos:current_start])) + + nested_entities = [] + remaining_entities = [] + + for entity in entity_list[1:]: + entity_start = entity.offset * 2 + # entity_end = entity_start + entity.length * 2 + + if current_start <= entity_start < current_end: + nested_entities.append(entity) + else: + remaining_entities.append(entity) + + if nested_entities: + inner_content = process_entities( + byte_text, + nested_entities, + current_start, + current_end + ) else: - # Here we are processing nested entities. - # We shouldn't update offset, because they are the same as entity before. - # And, here we are replacing previous string with a new html-rendered text(previous string is already html-rendered, - # And we don't change it). - entity_string = html_text[start_index: end_index].encode("utf-16-le") - formatted_string = func(entity_string, subst_type=entity.type, url=entity.url, user=entity.user, - custom_emoji_id=entity.custom_emoji_id, - language=entity.language). \ - replace("&", "&").replace("<", "<").replace(">", ">") - html_text = html_text[:start_index] + formatted_string + html_text[end_index:] - end_index = len(html_text) + inner_content = escape_entity(byte_text[current_start:current_end]) - if offset * 2 < len(utf16_text): - html_text += func(utf16_text[offset * 2:]) + result.append(format_entity(current_entity, inner_content)) - return html_text + if current_end < end_pos and remaining_entities: + result.append(process_entities( + byte_text, + remaining_entities, + current_end, + end_pos + )) + elif current_end < end_pos: + result.append(escape_entity(byte_text[current_end:end_pos])) + + return "".join(result) + + html_result = process_entities(utf16_text, sorted_entities) + + return html_result #region DeepSeek vibecoding here @@ -609,552 +604,3 @@ def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity format_args["text"] = '{}'.format(entity.language, format_args["text"]) return template.format(**format_args) - -def apply_html_entities_ds(text: str, entities: Optional[List], # entities: Optional[List[MessageEntity]] - custom_subs: Optional[Dict[str, str]] = None) -> str: - """ - Parse text message to HTML code according to message entities. - Properly handles UTF-16 code units for offsets and nested entities. - - Args: - text: Plain text message - entities: List of MessageEntity objects - custom_subs: Optional mapping of entity types to custom HTML substitutions/templates. - - Returns: - HTML formatted string - """ - if not text: - return text - elif not entities: - return text.replace("&", "&").replace("<", "<").replace(">", ">") - - processor = EntityProcessor(text, custom_subs=custom_subs) - - # Sort entities by their position in the text - # For proper nesting handling, we need to process from the end - sorted_entities = sorted(entities, key=lambda e: e.offset, reverse=True) - - # Build a tree structure of entities - # First, convert UTF-16 offsets to character indices for easier processing - entity_ranges = [] - for entity in sorted_entities: - if not processor.check_entity_exists(entity.type): - continue - - start_char = processor.utf16_to_char_index(entity.offset) - end_char = processor.utf16_to_char_index(entity.offset + entity.length) - - entity_ranges.append({ - 'entity': entity, - 'start': start_char, - 'end': end_char, - 'type': entity.type, - }) - - # Sort by start position (ascending) and then by length (descending) - # This ensures parent entities come before children - entity_ranges.sort(key=lambda x: (x['start'], -x['end'])) - - # Build the HTML recursively - def process_range(start_idx: int, end_idx: int, entities_in_range: List[dict]) -> str: - """ - Recursively process a text range with its entities. - """ - if not entities_in_range: - return text[start_idx:end_idx] - - # Group entities by their start position - result_parts = [] - current_pos = start_idx - - # Sort entities by their start position - entities_in_range.sort(key=lambda x: x['start']) - - i = 0 - while i < len(entities_in_range): - cur_entity = entities_in_range[i] - - # Add text before this entity - if cur_entity['start'] > current_pos: - result_parts.append(text[current_pos:cur_entity['start']]) - - # Find all entities that start at the same position or are nested within - nested_entities = [] - j = i - while j < len(entities_in_range) and entities_in_range[j]['start'] < cur_entity['end']: - if entities_in_range[j]['start'] >= cur_entity['start']: - nested_entities.append(entities_in_range[j]) - j += 1 - - # Filter entities that are actually within this entity's range - nested_entities = [e for e in nested_entities if - e['start'] >= cur_entity['start'] and e['end'] <= cur_entity['end']] - - # Process the content of this entity (including nested entities) - content = process_range(cur_entity['start'], cur_entity['end'], - [e for e in nested_entities if e != cur_entity]) - - # Apply this entity's HTML tag - html_content = processor.create_html_tag(cur_entity['entity'], content) - result_parts.append(html_content) - - # Move current position to the end of this entity - current_pos = cur_entity['end'] - i = j - - # Add remaining text - if current_pos < end_idx: - result_parts.append(text[current_pos:end_idx]) - - return ''.join(result_parts) - - # Process the entire text - return process_range(0, len(text), entity_ranges) -#endregion - -#region Gemini vibecoding here -def apply_html_entities_gm( - text: str, - entities: Optional[List], # entities: Optional[List[MessageEntity]] - custom_subs: Optional[Dict[str, str]] = None -) -> str: - # if not entities: - # return html.escape(text) - if not text: - return text - elif not entities: - return text.replace("&", "&").replace("<", "<").replace(">", ">") - - # --- Step 1: Map UTF-16 offsets to Python String Indices --- - # Telegram API uses UTF-16 code units for offsets/length. - # Python strings are indexed by Unicode code points. - # We need to map: utf16_offset -> python_string_index - - # Identify all 'significant' UTF-16 boundaries we care about (start and end of every entity) - boundaries = set() - for e in entities: - boundaries.add(e.offset) - boundaries.add(e.offset + e.length) - - # Sort them to iterate through the text linearly - sorted_boundaries = sorted(list(boundaries)) - boundary_map = {} # Maps utf16_offset -> python_index - - current_utf16_len = 0 - boundary_idx = 0 - - # Iterate over the string code point by code point - for py_index, char in enumerate(text): - # If we reached a boundary, record the mapping - while boundary_idx < len(sorted_boundaries) and current_utf16_len == sorted_boundaries[boundary_idx]: - boundary_map[sorted_boundaries[boundary_idx]] = py_index - boundary_idx += 1 - - if boundary_idx >= len(sorted_boundaries): - break - - # Advance UTF-16 counter - # BMP characters (<= 0xFFFF) take 1 unit. Non-BMP (surrogates) take 2 units. - if ord(char) > 0xFFFF: - current_utf16_len += 2 - else: - current_utf16_len += 1 - - # Handle boundaries that fall exactly at the end of the string - while boundary_idx < len(sorted_boundaries) and current_utf16_len == sorted_boundaries[boundary_idx]: - boundary_map[sorted_boundaries[boundary_idx]] = len(text) - boundary_idx += 1 - - # --- Step 2: Create Markers --- - # We transform entities into "Insert Start Tag" and "Insert End Tag" markers. - markers = [] - - for e in entities: - if e.offset not in boundary_map or (e.offset + e.length) not in boundary_map: - continue # Skip invalid entities - - start_py = boundary_map[e.offset] - end_py = boundary_map[e.offset + e.length] - - # Structure: (Index, Type, Priority, Entity) - # Type: 1 = Start Tag, 0 = End Tag. - # Priority: Used to ensure correct nesting (Outer tags wrap Inner tags). - # - For Start Tags (1): Larger length = Higher priority (Process earlier). - # We use negative length so 'smaller' number comes first in ASC sort. - # - For End Tags (0): Smaller length = Higher priority (Process earlier). - - # Start Marker - markers.append((start_py, 1, -e.length, e)) - - # End Marker - markers.append((end_py, 0, e.length, e)) - - # --- Step 3: Sort Markers --- - # Primary Key: Index (asc) - # Secondary Key: Type (End tags (0) before Start tags (1) at same index) -> This fixes vs - # Tertiary Key: Priority (Length based nesting) - - # FIX: We use a lambda key to avoid comparing the 'e' (MessageEntity) object directly - markers.sort(key=lambda x: (x[0], x[1], x[2])) - - # --- Step 4: Build HTML --- - result = [] - text_ptr = 0 - stack = [] # To track currently open entities - - for index, tag_type, _, entity in markers: - # 1. Append text leading up to this marker - if index > text_ptr: - result.append(html.escape(text[text_ptr:index])) - text_ptr = index - - # 2. Get the HTML tag representation - tag = get_html_tag(entity, custom_subs) - if not tag: - continue - - if tag_type == 1: # START TAG - result.append(tag['open']) - stack.append(entity) - - else: # END TAG - # If stack is empty (shouldn't happen in valid data), ignore - if not stack: - continue - - # If the entity to close is at the top of the stack, close it normally - if stack[-1] == entity: - result.append(tag['close']) - stack.pop() - else: - # INTERSECTING ENTITIES DETECTED - # We need to close everything down to our entity, then reopen them - if entity in stack: - temp_stack = [] - - # Pop and close until we find the target - while stack[-1] != entity: - top_entity = stack.pop() - top_tag = get_html_tag(top_entity, custom_subs) - if top_tag: - result.append(top_tag['close']) - temp_stack.append(top_entity) - - # Close the target entity - result.append(tag['close']) - stack.pop() - - # Re-open the temporarily closed entities (in reverse order to preserve nesting) - for popped_entity in reversed(temp_stack): - p_tag = get_html_tag(popped_entity, custom_subs) - if p_tag: - result.append(p_tag['open']) - stack.append(popped_entity) - - # Append remaining text - if text_ptr < len(text): - result.append(html.escape(text[text_ptr:])) - - return "".join(result) - - -def get_html_tag(entity, custom_subs: Optional[Dict[str, str]]) -> Optional[Dict[str, str]]: # entity: MessageEntity - """Helper to get open/close tags based on entity type.""" - - # Check custom subs first (basic implementation: if type in dict, return it as open tag) - # Note: The prompt implies full substitutions, but simple key-value usually implies 'open' tag or full format. - # Given the complexity of closing tags, we stick to the Prompt's Rules for known types. - - t = entity.type - if t == "bold": - return {'open': "", 'close': ""} - elif t == "italic": - return {'open': "", 'close': ""} - elif t == "underline": - return {'open': "", 'close': ""} - elif t == "strikethrough": - return {'open': "", 'close': ""} - elif t == "spoiler": - return {'open': '', 'close': ""} - elif t == "code": - return {'open': "", 'close': ""} - elif (t == "pre") and entity.language: - return {'open': f'
', 'close': "
"} - elif t == "pre": - return {'open': "
", 'close': "
"} - elif t == "blockquote": - return {'open': "
", 'close': "
"} - elif t == "expandable_blockquote": - return {'open': "
", 'close': "
"} - elif t == "text_link": - return {'open': f'', 'close': ""} - elif t == "text_mention": - return {'open': f'', 'close': ""} - elif t == "custom_emoji": - return {'open': f'', 'close': ""} - elif custom_subs and (t in custom_subs): - # Support custom substitutions by splitting the template around the {text} placeholder - template = custom_subs[t] - if "{text}" in template: - open_part, close_part = template.split("{text}", 1) - else: - # If no {text} placeholder is present, treat the entire template as the opening part - open_part, close_part = template, "" - return {'open': open_part, 'close': close_part} - - return None -#endregion - -#region ChatGPT vibecoding here -ENTITY_TEMPLATES_CG = { - "bold": "{text}", - "italic": "{text}", - "pre": "
{text}
", - "code": "{text}", - "text_link": "{text}", - "strikethrough": "{text}", - "underline": "{text}", - "spoiler": "{text}", - "custom_emoji": "{text}", - "blockquote": "
{text}
", - "expandable_blockquote": "
{text}
", -} - -def utf16_index_map(s: str) -> List[int]: - """ - Map UTF-16 code unit index -> Python string index. - Result length = utf16_len + 1 - """ - mapping = [0] - u16 = 0 - for i, ch in enumerate(s): - code = ord(ch) - u16 += 2 if code > 0xFFFF else 1 - while len(mapping) <= u16: - mapping.append(i + 1) - return mapping - -def apply_template(entity, inner: str, custom_subs: Optional[Dict[str, str]]) -> str: - t = entity.type - if t in ENTITY_TEMPLATES_CG: - tpl = ENTITY_TEMPLATES_CG[t] - elif custom_subs and t in custom_subs: - tpl = custom_subs[t] - else: - return inner - - data = {"text": inner} - - if t == "text_link": - data["url"] = getattr(entity, "url", "") - elif t == "text_mention": - data["url"] = f"tg://user?id={getattr(entity, 'user', {}).id if getattr(entity, 'user', None) else ''}" - elif t == "custom_emoji": - data["custom_emoji_id"] = getattr(entity, "custom_emoji_id", "") - elif (t == "pre") and getattr(entity, "language", None): - data["text"] = f'{inner}' - - return tpl.format(**data) - -def build_tree(entities: List, mapping: List[int]): - nodes = [] - - for e in entities: - start16 = e.offset - end16 = e.offset + e.length - - start = mapping[start16] - end = mapping[end16] - - nodes.append({ - "entity": e, - "start": start, - "end": end, - "children": [] - }) - - nodes.sort(key=lambda node: (node["start"], -node["end"])) - - stack = [] - roots = [] - - for n in nodes: - while stack and n["start"] >= stack[-1]["end"]: - stack.pop() - - if stack: - stack[-1]["children"].append(n) - else: - roots.append(n) - - stack.append(n) - - return roots - -def render(text: str, nodes, custom_subs): - result = [] - pos = 0 - - for n in nodes: - result.append(text[pos:n["start"]]) - - inner = render( - text[n["start"]:n["end"]], - shift_nodes(n["children"], n["start"]), - custom_subs - ) - - wrapped = apply_template(n["entity"], inner, custom_subs) - result.append(wrapped) - - pos = n["end"] - - result.append(text[pos:]) - return "".join(result) - -def shift_nodes(nodes, shift): - out = [] - for n in nodes: - out.append({ - "entity": n["entity"], - "start": n["start"] - shift, - "end": n["end"] - shift, - "children": shift_nodes(n["children"], shift), - }) - return out - -def apply_html_entities_cg( - text: str, - entities: Optional[List], - custom_subs: Optional[Dict[str, str]] -) -> str: - if not text: - return text - elif not entities: - return text.replace("&", "&").replace("<", "<").replace(">", ">") - - mapping = utf16_index_map(text) - tree = build_tree(entities, mapping) - return render(text, tree, custom_subs) -#endregion - -def apply_html_entities_coder(text: str, entities=None, custom_subs=None) -> str: - """ - Apply HTML formatting to text based on provided entities. - Handles nested and overlapping entities correctly. - """ - if not entities: - return text.replace("&", "&").replace("<", "<").replace(">", ">") - - _subs_c = { - "bold": "{text}", - "italic": "{text}", - "pre": "
{text}
", - "code": "{text}", - "text_link": "{text}", - "strikethrough": "{text}", - "underline": "{text}", - "spoiler": "{text}", - "custom_emoji": "{text}", - "blockquote": "
{text}
", - "expandable_blockquote": "
{text}
", - } - - if custom_subs: - for key, value in custom_subs.items(): - _subs_c[key] = value - - # Sort entities by offset (starting position), with longer entities first for equal offsets - sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length)) - - # Convert text to utf-16 encoding for proper handling - utf16_text = text.encode("utf-16-le") - - def escape_entity(text_part): - """Escape HTML special characters in a text part""" - if isinstance(text_part, bytes): - text_part = text_part.decode("utf-16-le") - return text_part.replace("&", "&").replace("<", "<").replace(">", ">") - - def format_entity(entity, content): - """Apply entity formatting to the content""" - entity_type = entity.type - - # Handle different entity types - if entity_type == "text_mention" and hasattr(entity, 'user'): - return f"{content}" - # elif entity_type == "mention": # No need to do this, @username works fine - # username = content[1:] # Remove @ symbol - # return f"{content}" - elif entity_type == "text_link" and hasattr(entity, 'url'): - return f"{content}" - elif entity_type == "custom_emoji" and hasattr(entity, 'custom_emoji_id'): - return f"{content}" - elif entity_type == "pre" and hasattr(entity, 'language') and entity.language: - return f"
{content}
" - elif entity_type in _subs_c: - template = _subs_c[entity_type] - return template.format(text=content) - - # If no matching entity type, return text as is - return content - - def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): - if end_pos is None: - end_pos = len(byte_text) - - if not entity_list or start_pos >= end_pos: - return escape_entity(byte_text[start_pos:end_pos]) - - current_entity = entity_list[0] - current_start = current_entity.offset * 2 - current_end = current_start + current_entity.length * 2 - - if current_end <= start_pos or current_start >= end_pos: - return escape_entity(byte_text[start_pos:end_pos]) - - result = [] - - if current_start > start_pos: - result.append(escape_entity(byte_text[start_pos:current_start])) - - nested_entities = [] - remaining_entities = [] - - for entity in entity_list[1:]: - entity_start = entity.offset * 2 - # entity_end = entity_start + entity.length * 2 - - if current_start <= entity_start < current_end: - nested_entities.append(entity) - else: - remaining_entities.append(entity) - - if nested_entities: - inner_content = process_entities( - byte_text, - nested_entities, - current_start, - current_end - ) - else: - inner_content = escape_entity(byte_text[current_start:current_end]) - - result.append(format_entity(current_entity, inner_content)) - - if current_end < end_pos and remaining_entities: - result.append(process_entities( - byte_text, - remaining_entities, - current_end, - end_pos - )) - elif current_end < end_pos: - result.append(escape_entity(byte_text[current_end:end_pos])) - - return "".join(result) - - html_result = process_entities(utf16_text, sorted_entities) - - return html_result From bb9ae6909c55059cde5995233f60489dd2adafa4 Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 21 Feb 2026 12:33:08 +0300 Subject: [PATCH 4/8] Fix typing --- telebot/formatting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 95e81db0f..cef2584ab 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -373,7 +373,7 @@ def hcite(content: str, escape: Optional[bool] = True, expandable: Optional[bool ) -def apply_html_entities(text: str, entities=None, custom_subs=None) -> str: +def apply_html_entities(text: str, entities: Optional[List]=None, custom_subs: Optional[Dict[str, str]]=None) -> str: """ Apply HTML formatting to text based on provided entities. Handles nested and overlapping entities correctly. From b6620c3f23918cfb6403a7704140b536ad1961f0 Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 21 Feb 2026 12:36:05 +0300 Subject: [PATCH 5/8] Fix naming --- telebot/formatting.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index cef2584ab..717858b36 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -381,7 +381,7 @@ def apply_html_entities(text: str, entities: Optional[List]=None, custom_subs: O if not entities: return text.replace("&", "&").replace("<", "<").replace(">", ">") - _subs_c = { + _subs = { "bold": "{text}", "italic": "{text}", "pre": "
{text}
", @@ -397,7 +397,7 @@ def apply_html_entities(text: str, entities: Optional[List]=None, custom_subs: O if custom_subs: for key, value in custom_subs.items(): - _subs_c[key] = value + _subs[key] = value # Sort entities by offset (starting position), with longer entities first for equal offsets sorted_entities = sorted(entities, key=lambda e: (e.offset, -e.length)) @@ -427,8 +427,8 @@ def format_entity(entity, content): return f"{content}" elif entity_type == "pre" and hasattr(entity, 'language') and entity.language: return f"
{content}
" - elif entity_type in _subs_c: - template = _subs_c[entity_type] + elif entity_type in _subs: + template = _subs[entity_type] return template.format(text=content) # If no matching entity type, return text as is From b066c2da8cd287d16de1b0e4fe6cdf66269f8444 Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 21 Feb 2026 12:52:13 +0300 Subject: [PATCH 6/8] Fix entities test --- tests/test_types.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_types.py b/tests/test_types.py index 630ab658c..5a8afd6e9 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -278,7 +278,7 @@ def test_message_entity(): sample_string_1 = r'{"update_id":934522126,"message":{"message_id":1374510,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682177590,"text":"b b b","entities":[{"offset":0,"length":2,"type":"bold"},{"offset":0,"length":1,"type":"italic"},{"offset":2,"length":2,"type":"bold"},{"offset":2,"length":1,"type":"italic"},{"offset":4,"length":1,"type":"bold"},{"offset":4,"length":1,"type":"italic"}]}}' update = types.Update.de_json(sample_string_1) message: types.Message = update.message - assert message.html_text == "b b b" + assert message.html_text == "b b b" sample_string_2 = r'{"update_id":934522166,"message":{"message_id":1374526,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179716,"text":"b b b","entities":[{"offset":0,"length":1,"type":"bold"},{"offset":2,"length":1,"type":"bold"},{"offset":4,"length":1,"type":"italic"}]}}' message_2 = types.Update.de_json(sample_string_2).message From f8f081033ed95b3a449b5ee29107bffb1f17cb5f Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 21 Feb 2026 13:09:33 +0300 Subject: [PATCH 7/8] Fix entities tests --- tests/test_types.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/test_types.py b/tests/test_types.py index 5a8afd6e9..04e8b3ac8 100644 --- a/tests/test_types.py +++ b/tests/test_types.py @@ -288,12 +288,16 @@ def test_message_entity(): sample_string_3 = r'{"update_id":934522172,"message":{"message_id":1374530,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179968,"text":"This is a bold text with a nested italic and bold text.","entities":[{"offset":10,"length":4,"type":"bold"},{"offset":27,"length":7,"type":"italic"},{"offset":34,"length":15,"type":"bold"},{"offset":34,"length":15,"type":"italic"}]}}' message_3 = types.Update.de_json(sample_string_3).message - assert message_3.html_text == "This is a bold text with a nested italic and bold text." + assert \ + (message_3.html_text == "This is a bold text with a nested italic and bold text.") or \ + (message_3.html_text == "This is a bold text with a nested italic and bold text.") sample_string_4 = r'{"update_id":934522437,"message":{"message_id":1374619,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682189507,"forward_from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"forward_date":1682189124,"text":"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa😋😋","entities":[{"offset":0,"length":76,"type":"bold"},{"offset":0,"length":76,"type":"italic"},{"offset":0,"length":76,"type":"underline"},{"offset":0,"length":76,"type":"strikethrough"},{"offset":76,"length":2,"type":"custom_emoji","custom_emoji_id":"5456188142006575553"},{"offset":78,"length":2,"type":"custom_emoji","custom_emoji_id":"5456188142006575553"}]}}' message_4 = types.Update.de_json(sample_string_4).message - assert message_4.html_text == 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa😋😋' + assert \ + (message_4.html_text == 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa😋😋') or \ + (message_4.html_text == 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa😋😋') sample_string_5 = r'{"update_id":934522166,"message":{"message_id":1374526,"from":{"id":927266710,"is_bot":false,"first_name":">_run","username":"coder2020","language_code":"en","is_premium":true},"chat":{"id":927266710,"first_name":">_run","username":"coder2020","type":"private"},"date":1682179716,"text":"b b i","entities":[{"offset":0,"length":1,"type":"bold"}]}}' From 5a47c6c3b13a65f4868cfe5dc8709c827550407b Mon Sep 17 00:00:00 2001 From: Badiboy Date: Sat, 21 Feb 2026 13:18:32 +0300 Subject: [PATCH 8/8] Class was not removed --- telebot/formatting.py | 112 ------------------------------------------ 1 file changed, 112 deletions(-) diff --git a/telebot/formatting.py b/telebot/formatting.py index 717858b36..ebbcb77be 100644 --- a/telebot/formatting.py +++ b/telebot/formatting.py @@ -492,115 +492,3 @@ def process_entities(byte_text, entity_list, start_pos=0, end_pos=None): html_result = process_entities(utf16_text, sorted_entities) return html_result - - -#region DeepSeek vibecoding here -class EntityProcessor: - """ - Handles parsing of text with message entities to HTML. - """ - - # Entity type to HTML template mapping - ENTITY_TEMPLATES = { - "bold": "{text}", - "italic": "{text}", - "pre": "
{text}
", - "code": "{text}", - "text_link": "{text}", - "strikethrough": "{text}", - "underline": "{text}", - "spoiler": "{text}", - "custom_emoji": "{text}", - "blockquote": "
{text}
", - "expandable_blockquote": "
{text}
", - } - - def __init__(self, text: str, custom_subs: Optional[Dict[str, str]] = None): - self.text = text - self.utf16_mapping = self.utf16_code_units_to_indices(text) - self.total_utf16_units = len(self.utf16_mapping) - self.custom_subs = custom_subs - - def check_entity_exists(self, entity_type: str) -> bool: - """ - Check if an entity type has a defined HTML template, considering custom substitutions. - """ - return (entity_type in self.ENTITY_TEMPLATES) or (self.custom_subs and (entity_type in self.custom_subs)) - - def get_entity_template(self, entity_type: str, default: Optional[str] = None) -> Optional[str]: - """ - Get the HTML template for a given entity type, considering custom substitutions. - """ - if entity_type in self.ENTITY_TEMPLATES: - return self.ENTITY_TEMPLATES[entity_type] - elif self.custom_subs and (entity_type in self.custom_subs): - return self.custom_subs[entity_type] - else: - return default - - @staticmethod - def utf16_code_units_to_indices(text: str) -> List[int]: - """ - Convert UTF-16 code unit positions to Python string indices. - - Returns: - code_unit_to_char_idx: Mapping from UTF-16 code unit position to character index - """ - code_unit_to_char_idx = [] - - code_unit_pos = 0 - for char_idx, char in enumerate(text): - code_point = ord(char) - # Characters outside BMP (U+10000 to U+10FFFF) use 2 UTF-16 code units - if code_point >= 0x10000: - code_units = 2 - else: - code_units = 1 - - # Map this code unit position to character index - for _ in range(code_units): - code_unit_to_char_idx.append(char_idx) - - code_unit_pos += code_units - - return code_unit_to_char_idx - - def utf16_to_char_index(self, utf16_pos: int) -> int: - """ - Convert UTF-16 code unit position to character index. - """ - if utf16_pos >= len(self.utf16_mapping): - return len(self.text) - return self.utf16_mapping[utf16_pos] - - def get_entity_text(self, entity) -> str: # entity: MessageEntity - """ - Extract the text for an entity using UTF-16 code unit offsets. - """ - start_char = self.utf16_to_char_index(entity.offset) - end_char = self.utf16_to_char_index(entity.offset + entity.length) - return self.text[start_char:end_char] - - def create_html_tag(self, entity, content: str) -> str: # entity: MessageEntity - """ - Create HTML tag for an entity with the given content. - """ - entity_type = entity.type - - template = self.get_entity_template(entity_type) - if not template: - return content - - # Prepare format arguments - format_args = {"text": content} - if entity_type == "text_mention": - template = self.get_entity_template("text_link") - format_args["url"] = "tg://user?id={0}".format(entity.user.id) - elif entity_type == "text_link": - format_args["url"] = escape_html(entity.url or "") - elif entity_type == "custom_emoji": - format_args["custom_emoji_id"] = entity.custom_emoji_id or "" - elif entity_type == "pre" and entity.language: - format_args["text"] = '{}'.format(entity.language, format_args["text"]) - - return template.format(**format_args)