lunduke-bot/content/parser.py
2025-03-07 20:54:49 +00:00

53 lines
1.9 KiB
Python

import tomllib # For parsing the TOML-like section (Python 3.11+)
import re
from pathlib import Path
class PostParser:
def __init__(self, file_path):
"""
Initialize the parser with the path to the input file.
:param file_path: Path to the file that contains the input data.
"""
self.file_path = file_path
self.fixed_path = Path(self.file_path).expanduser()
def parse(self):
"""
Parses the file into two parts: a dictionary of TOML values and Markdown content.
:return: A tuple containing (TOML dictionary, Markdown string).
"""
try:
with open(self.fixed_path, "r") as f:
content = f.read()
# Extract the triple-quoted TOML section and Markdown using regex
match = re.match(r'"""(.*?)"""\n(.*)', content, re.DOTALL)
if match is None:
raise ValueError("Input file does not follow the expected format.")
toml_content, markdown_content = match.groups()
# Validate and parse the TOML section
self._validate_toml(toml_content.strip())
toml_dict = tomllib.loads(toml_content.strip())
return toml_dict, markdown_content.strip()
except Exception as e:
raise RuntimeError(f"Failed to parse the file '{self.file_path}'. Error: {e}")
def _validate_toml(self, toml_content):
"""
Validates the TOML section for known issues and provides descriptive error messages.
:param toml_content: The TOML content as a string.
:raises ValueError: If the TOML content contains known errors.
"""
# Check for uppercase booleans
if re.search(r"=\s*(True|False)", toml_content):
raise ValueError("TOML booleans must be lowercase (true/false).")
# Add additional TOML validation as needed