diff --git a/bs2json/__init__.py b/bs2json/__init__.py index 77a6523..0f41633 100644 --- a/bs2json/__init__.py +++ b/bs2json/__init__.py @@ -13,6 +13,7 @@ def to_json( tag: element.Tag, include_comments: bool=True, strip: bool=True, + keep_order: bool=False, attr_name: str="attrs", text_name: str="text", comment_name: str="comment", @@ -27,7 +28,8 @@ def to_json( kwargs = dict( **name_kwargs, include_comments=include_comments, - strip=strip + strip=strip, + keep_order=keep_order ) diff --git a/bs2json/bs2json.py b/bs2json/bs2json.py index e467d4d..1681e38 100644 --- a/bs2json/bs2json.py +++ b/bs2json/bs2json.py @@ -14,6 +14,7 @@ class BS2Json: """ include_comments = True strip = True + keep_order = False __labels: Dict = {} soup: BeautifulSoup = None last_obj: Dict = {} @@ -24,6 +25,7 @@ def __init__(self, *, include_comments: Union[bool, str]=True, strip: bool=True, + keep_order: bool=False, **kwargs ) -> NoReturn: """Initialize the instance of bs2json class. @@ -34,6 +36,8 @@ def __init__(self, include_comments (bool, optional): Whether to include comments in the JSON representation. Defaults to True. strip (bool): Whether to remove whitespaces from the start and end of text. + keep_order (bool): Whether to preserve the original order of elements instead of + grouping them by type. Defaults to False. **kwargs: Keyword arguments for initializing BeautifulSoup. """ @@ -56,6 +60,7 @@ def __init__(self, self.soup = soup self.include_comments = include_comments self.strip = strip + self.keep_order = keep_order self.labels(attrs=attr_name, text=text_name, comment=comment_name) @@ -262,7 +267,13 @@ def __tag(self, element): if isinstance(value, dict): json[element.name].update(value) elif isinstance(value, list): - if element.attrs: + # When keep_order=True, simplify single-text elements + if (self.keep_order and len(value) == 1 and + isinstance(value[0], dict) and len(value[0]) == 1 and + text_name in value[0] and not element.attrs): + # Single text content without attributes: return just the text + return value[0][text_name] + elif element.attrs: value.append(json[element.name]) json[element.name] = value else: @@ -298,7 +309,9 @@ def to_json(self, comment_name = self.__labels['comment'] if isinstance(element,Element.Tag): json[element.name] = self.__tag(element) - if json[element.name].get(text_name) and len(json[element.name]) == 1: + if (isinstance(json[element.name], dict) and + json[element.name].get(text_name) and + len(json[element.name]) == 1): return json[element.name][text_name] json = json[element.name] elif isinstance(element, Element.Comment) and self.include_comments: @@ -314,14 +327,33 @@ def to_json(self, json['doctype'] = str(element) json.update(self.to_json(element.next_element)) elif isinstance(element, (Iterator, Iterable)): - for elem in element: - name = self.__get_name(elem) - value = self.to_json(elem) or None - if not value and name == text_name: - continue - if name in json: - json[name].append(value) - else: - json[name] = [value] - self.__fix(json) + if self.keep_order: + # Return a list preserving order instead of grouping by type + ordered_list = [] + for elem in element: + name = self.__get_name(elem) + value = self.to_json(elem) or None + if not value and name == text_name: + continue + + # Simplify single-text elements like {"h3": [{"text": "chapter 1"}]} to {"h3": "chapter 1"} + if (isinstance(value, list) and len(value) == 1 and + isinstance(value[0], dict) and len(value[0]) == 1 and + text_name in value[0]): + value = value[0][text_name] + + ordered_list.append({name: value}) + return ordered_list + else: + # Original behavior: group by type + for elem in element: + name = self.__get_name(elem) + value = self.to_json(elem) or None + if not value and name == text_name: + continue + if name in json: + json[name].append(value) + else: + json[name] = [value] + self.__fix(json) return json diff --git a/tests/tests.py b/tests/tests.py index 41645ec..3e96dba 100644 --- a/tests/tests.py +++ b/tests/tests.py @@ -72,5 +72,42 @@ def test_prettify(self): result = out['stdout'] self.assertEqual(result, expected_4) + def test_keep_order_feature(self): + """Test the keep_order feature preserves element order.""" + html_ordered = '''
+paragraph
+