Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion bs2json/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ def to_json(
tag: element.Tag,
include_comments: bool=True,
strip: bool=True,
keep_order: bool=False,
attr_name: str="attrs",
text_name: str="text",
comment_name: str="comment",
Expand All @@ -27,7 +28,8 @@ def to_json(
kwargs = dict(
**name_kwargs,
include_comments=include_comments,
strip=strip
strip=strip,
keep_order=keep_order
)


Expand Down
56 changes: 44 additions & 12 deletions bs2json/bs2json.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class BS2Json:
"""
include_comments = True
strip = True
keep_order = False
__labels: Dict = {}
soup: BeautifulSoup = None
last_obj: Dict = {}
Expand All @@ -24,6 +25,7 @@ def __init__(self,
*,
include_comments: Union[bool, str]=True,
strip: bool=True,
keep_order: bool=False,
**kwargs
) -> NoReturn:
"""Initialize the instance of bs2json class.
Expand All @@ -34,6 +36,8 @@ def __init__(self,
include_comments (bool, optional): Whether to include comments in the JSON
representation. Defaults to True.
strip (bool): Whether to remove whitespaces from the start and end of text.
keep_order (bool): Whether to preserve the original order of elements instead of
grouping them by type. Defaults to False.
**kwargs: Keyword arguments for initializing BeautifulSoup.
"""

Expand All @@ -56,6 +60,7 @@ def __init__(self,
self.soup = soup
self.include_comments = include_comments
self.strip = strip
self.keep_order = keep_order

self.labels(attrs=attr_name, text=text_name, comment=comment_name)

Expand Down Expand Up @@ -262,7 +267,13 @@ def __tag(self, element):
if isinstance(value, dict):
json[element.name].update(value)
elif isinstance(value, list):
if element.attrs:
# When keep_order=True, simplify single-text elements
if (self.keep_order and len(value) == 1 and
isinstance(value[0], dict) and len(value[0]) == 1 and
text_name in value[0] and not element.attrs):
# Single text content without attributes: return just the text
return value[0][text_name]
elif element.attrs:
value.append(json[element.name])
json[element.name] = value
else:
Expand Down Expand Up @@ -298,7 +309,9 @@ def to_json(self,
comment_name = self.__labels['comment']
if isinstance(element,Element.Tag):
json[element.name] = self.__tag(element)
if json[element.name].get(text_name) and len(json[element.name]) == 1:
if (isinstance(json[element.name], dict) and
json[element.name].get(text_name) and
len(json[element.name]) == 1):
return json[element.name][text_name]
json = json[element.name]
elif isinstance(element, Element.Comment) and self.include_comments:
Expand All @@ -314,14 +327,33 @@ def to_json(self,
json['doctype'] = str(element)
json.update(self.to_json(element.next_element))
elif isinstance(element, (Iterator, Iterable)):
for elem in element:
name = self.__get_name(elem)
value = self.to_json(elem) or None
if not value and name == text_name:
continue
if name in json:
json[name].append(value)
else:
json[name] = [value]
self.__fix(json)
if self.keep_order:
# Return a list preserving order instead of grouping by type
ordered_list = []
for elem in element:
name = self.__get_name(elem)
value = self.to_json(elem) or None
if not value and name == text_name:
continue

# Simplify single-text elements like {"h3": [{"text": "chapter 1"}]} to {"h3": "chapter 1"}
if (isinstance(value, list) and len(value) == 1 and
isinstance(value[0], dict) and len(value[0]) == 1 and
text_name in value[0]):
value = value[0][text_name]

ordered_list.append({name: value})
return ordered_list
else:
# Original behavior: group by type
for elem in element:
name = self.__get_name(elem)
value = self.to_json(elem) or None
if not value and name == text_name:
continue
if name in json:
json[name].append(value)
else:
json[name] = [value]
self.__fix(json)
return json
37 changes: 37 additions & 0 deletions tests/tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,5 +72,42 @@ def test_prettify(self):
result = out['stdout']
self.assertEqual(result, expected_4)

def test_keep_order_feature(self):
    """Check that ``keep_order=True`` keeps sibling elements in document order.

    The same markup is converted twice: once with the default settings,
    where repeated tags are grouped under a single key, and once with
    ``keep_order=True``, where the children of ``<body>`` are expected to
    come back as a list of single-key dicts in their original order.
    """
    html_ordered = '''<html><body>
<h3>first heading</h3>
<p>paragraph</p>
<h3>second heading</h3>
<hr>
</body></html>'''

    # Default conversion: both <h3> texts are collected under one 'h3' key.
    grouped = BS2Json(html_ordered).convert()
    self.assertEqual(
        grouped['html']['body']['h3'],
        ['first heading', 'second heading'],
    )

    # Ordered conversion: 'html' is iterated as a sequence of entries, so
    # pick the first entry that carries a 'body' key (None if there is none).
    ordered = BS2Json(html_ordered, keep_order=True).convert()
    body_children = next(
        (entry['body'] for entry in ordered['html'] if 'body' in entry),
        None,
    )

    self.assertIsNotNone(body_children)
    self.assertIsInstance(body_children, list)

    # Each child is a one-key dict; the keys must appear as h3, p, h3, hr.
    tag_sequence = [tuple(child.keys())[0] for child in body_children]
    self.assertEqual(tag_sequence, ['h3', 'p', 'h3', 'hr'])

    # The two headings stay separate and are reduced to their plain text.
    first_child, third_child = body_children[0], body_children[2]
    self.assertEqual(first_child['h3'], 'first heading')
    self.assertEqual(third_child['h3'], 'second heading')

# Script entry point: run the test suite only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    unittest.main()