# lunduke-bot/content/parser.py

import tomllib  # For parsing the TOML-like section (Python 3.11+)
import re
from pathlib import Path


class PostParser:
    def __init__(self, file_path):
        """
        Initialize the parser with the path to the input file.

        :param file_path: Path to the file that contains the input data.
        """
        self.file_path = file_path
        self.fixed_path = Path(self.file_path).expanduser()

    def parse(self):
        """
        Parses the file into two parts: a dictionary of TOML values and Markdown content.

        :return: A tuple containing (TOML dictionary, Markdown string).
        """
        try:
            with open(self.fixed_path, "r") as f:
                content = f.read()

            # Extract the triple-quoted TOML section and Markdown using regex
            match = re.match(r'"""(.*?)"""\n(.*)', content, re.DOTALL)
            if match is None:
                raise ValueError("Input file does not follow the expected format.")

            toml_content, markdown_content = match.groups()

            # Validate and parse the TOML section
            self._validate_toml(toml_content.strip())
            toml_dict = tomllib.loads(toml_content.strip())

            return toml_dict, markdown_content.strip()
        except Exception as e:
            raise RuntimeError(f"Failed to parse the file '{self.file_path}'. Error: {e}")

    def _validate_toml(self, toml_content):
        """
        Validates the TOML section for known issues and provides descriptive error messages.

        :param toml_content: The TOML content as a string.
        :raises ValueError: If the TOML content contains known errors.
        """
        # Check for uppercase booleans
        if re.search(r"=\s*(True|False)", toml_content):
            raise ValueError("TOML booleans must be lowercase (true/false).")

        # Add additional TOML validation as needed