From 6518255e3551938ca6c224a65790f5b9a3d31ee0 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 3 Aug 2025 22:17:33 +0000 Subject: [PATCH 1/3] Initial plan From c3af79be74da19befcbc682e9e40f4aecdedb16d Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 3 Aug 2025 22:32:37 +0000 Subject: [PATCH 2/3] Implement keep_order feature - preserve element order instead of grouping by type Co-authored-by: MrDebugger <25988388+MrDebugger@users.noreply.github.com> --- bs2json/__init__.py | 4 +++- bs2json/bs2json.py | 56 +++++++++++++++++++++++++++++++++++---------- 2 files changed, 47 insertions(+), 13 deletions(-) diff --git a/bs2json/__init__.py b/bs2json/__init__.py index 77a6523..0f41633 100644 --- a/bs2json/__init__.py +++ b/bs2json/__init__.py @@ -13,6 +13,7 @@ def to_json( tag: element.Tag, include_comments: bool=True, strip: bool=True, + keep_order: bool=False, attr_name: str="attrs", text_name: str="text", comment_name: str="comment", @@ -27,7 +28,8 @@ def to_json( kwargs = dict( **name_kwargs, include_comments=include_comments, - strip=strip + strip=strip, + keep_order=keep_order ) diff --git a/bs2json/bs2json.py b/bs2json/bs2json.py index e467d4d..1681e38 100644 --- a/bs2json/bs2json.py +++ b/bs2json/bs2json.py @@ -14,6 +14,7 @@ class BS2Json: """ include_comments = True strip = True + keep_order = False __labels: Dict = {} soup: BeautifulSoup = None last_obj: Dict = {} @@ -24,6 +25,7 @@ def __init__(self, *, include_comments: Union[bool, str]=True, strip: bool=True, + keep_order: bool=False, **kwargs ) -> NoReturn: """Initialize the instance of bs2json class. @@ -34,6 +36,8 @@ def __init__(self, include_comments (bool, optional): Whether to include comments in the JSON representation. Defaults to True. strip (bool): Whether to remove whitespaces from the start and end of text. 
+ keep_order (bool): Whether to preserve the original order of elements instead of + grouping them by type. Defaults to False. **kwargs: Keyword arguments for initializing BeautifulSoup. """ @@ -56,6 +60,7 @@ def __init__(self, self.soup = soup self.include_comments = include_comments self.strip = strip + self.keep_order = keep_order self.labels(attrs=attr_name, text=text_name, comment=comment_name) @@ -262,7 +267,13 @@ def __tag(self, element): if isinstance(value, dict): json[element.name].update(value) elif isinstance(value, list): - if element.attrs: + # When keep_order=True, simplify single-text elements + if (self.keep_order and len(value) == 1 and + isinstance(value[0], dict) and len(value[0]) == 1 and + text_name in value[0] and not element.attrs): + # Single text content without attributes: return just the text + return value[0][text_name] + elif element.attrs: value.append(json[element.name]) json[element.name] = value else: @@ -298,7 +309,9 @@ def to_json(self, comment_name = self.__labels['comment'] if isinstance(element,Element.Tag): json[element.name] = self.__tag(element) - if json[element.name].get(text_name) and len(json[element.name]) == 1: + if (isinstance(json[element.name], dict) and + json[element.name].get(text_name) and + len(json[element.name]) == 1): return json[element.name][text_name] json = json[element.name] elif isinstance(element, Element.Comment) and self.include_comments: @@ -314,14 +327,33 @@ def to_json(self, json['doctype'] = str(element) json.update(self.to_json(element.next_element)) elif isinstance(element, (Iterator, Iterable)): - for elem in element: - name = self.__get_name(elem) - value = self.to_json(elem) or None - if not value and name == text_name: - continue - if name in json: - json[name].append(value) - else: - json[name] = [value] - self.__fix(json) + if self.keep_order: + # Return a list preserving order instead of grouping by type + ordered_list = [] + for elem in element: + name = self.__get_name(elem) + value 
= self.to_json(elem) or None + if not value and name == text_name: + continue + + # Simplify single-text elements like {"h3": [{"text": "chapter 1"}]} to {"h3": "chapter 1"} + if (isinstance(value, list) and len(value) == 1 and + isinstance(value[0], dict) and len(value[0]) == 1 and + text_name in value[0]): + value = value[0][text_name] + + ordered_list.append({name: value}) + return ordered_list + else: + # Original behavior: group by type + for elem in element: + name = self.__get_name(elem) + value = self.to_json(elem) or None + if not value and name == text_name: + continue + if name in json: + json[name].append(value) + else: + json[name] = [value] + self.__fix(json) return json From de4e9845f22b2fa0fb51991e11a55a9a6f2e4ab3 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Sun, 3 Aug 2025 22:35:09 +0000 Subject: [PATCH 3/3] Add comprehensive tests and complete keep_order feature implementation Co-authored-by: MrDebugger <25988388+MrDebugger@users.noreply.github.com> --- tests/tests.py | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/tests/tests.py b/tests/tests.py index 41645ec..3e96dba 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -72,5 +72,42 @@ def test_prettify(self): result = out['stdout'] self.assertEqual(result, expected_4) + def test_keep_order_feature(self): + """Test the keep_order feature preserves element order.""" + html_ordered = ''' +

<h3>first heading</h3>
+<p>paragraph</p>
+<h3>second heading</h3>
+<hr>
+''' + + # Test default behavior (grouping) + bs2json_default = BS2Json(html_ordered) + result_default = bs2json_default.convert() + self.assertEqual(result_default['html']['body']['h3'], ['first heading', 'second heading']) + + # Test keep_order behavior (preserving order) + bs2json_ordered = BS2Json(html_ordered, keep_order=True) + result_ordered = bs2json_ordered.convert() + + # Extract body content + body_content = None + for item in result_ordered['html']: + if 'body' in item: + body_content = item['body'] + break + + self.assertIsNotNone(body_content) + self.assertIsInstance(body_content, list) + + # Verify order is preserved: h3, p, h3, hr + expected_order = ['h3', 'p', 'h3', 'hr'] + actual_order = [list(element.keys())[0] for element in body_content] + self.assertEqual(actual_order, expected_order) + + # Verify h3 elements are separate and simplified + self.assertEqual(body_content[0]['h3'], 'first heading') + self.assertEqual(body_content[2]['h3'], 'second heading') + if __name__ == "__main__": unittest.main() \ No newline at end of file