From dc7fa4e122532a661d9a282a4514399d5f0351d0 Mon Sep 17 00:00:00 2001
From: Jared Dillard
Date: Sun, 18 May 2025 17:15:21 -0700
Subject: [PATCH] Add ability to remove directives

---
NOTE(review): this copy was re-flowed from a whitespace-collapsed export of
the patch. The indentation of context lines is reconstructed, not recovered;
confirm it against the target tree before applying. The blob ids on the
"index" lines are those of the original commit and do not reflect the
revised _remove_directives body below.

NOTE(review): _remove_directives no longer uses a single multi-line regex.
The earlier pattern used \s+ for the body indent; \s also matches a newline,
so after a blank line the next unindented paragraph was treated as directive
body and removed together with everything chained after it.

 sphinx_llms_txt/__init__.py  |  2 ++
 sphinx_llms_txt/processor.py | 49 ++++++++++++++++++++++++++++++++++++
 tests/test_llms_txt.py       | 41 ++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+)

diff --git a/sphinx_llms_txt/__init__.py b/sphinx_llms_txt/__init__.py
index 5a826f3..b549405 100644
--- a/sphinx_llms_txt/__init__.py
+++ b/sphinx_llms_txt/__init__.py
@@ -58,6 +58,7 @@ def build_finished(app: Sphinx, exception):
         "llms_txt_full_max_size": app.config.llms_txt_full_max_size,
         "llms_txt_directives": app.config.llms_txt_directives,
         "llms_txt_exclude": app.config.llms_txt_exclude,
+        "llms_txt_rm_directives": app.config.llms_txt_rm_directives,
         "html_baseurl": getattr(app.config, "html_baseurl", ""),
     }
     _manager.set_config(config)
@@ -86,6 +87,7 @@ def setup(app: Sphinx) -> Dict[str, Any]:
     app.add_config_value("llms_txt_title", None, "env")
     app.add_config_value("llms_txt_summary", None, "env")
     app.add_config_value("llms_txt_exclude", [], "env")
+    app.add_config_value("llms_txt_rm_directives", False, "env")

     # Connect to Sphinx events
     app.connect("doctree-resolved", doctree_resolved)
diff --git a/sphinx_llms_txt/processor.py b/sphinx_llms_txt/processor.py
index ef7024b..21ee619 100644
--- a/sphinx_llms_txt/processor.py
+++ b/sphinx_llms_txt/processor.py
@@ -50,8 +50,57 @@ def process_content(self, content: str, source_path: Path) -> str:
         # Then process path directives (image, figure, etc.)
         content = self._process_path_directives(content, source_path)

+        # Remove directives if configured to do so
+        if self.config.get("llms_txt_rm_directives", False):
+            content = self._remove_directives(content)
+
         return content

+    def _remove_directives(self, content: str) -> str:
+        """Remove reStructuredText directives from content.
+
+        A directive is a ``.. name::`` line plus its body: the following
+        lines indented deeper than the marker, including blank lines that
+        sit between such lines. Unindented text after a blank line is a
+        separate paragraph and is always kept.
+
+        Args:
+            content: The source content from which to remove directives
+
+        Returns:
+            Content with all directives removed
+        """
+        # Match only the marker line; the body is found by indentation so a
+        # blank line cannot drag the next paragraph into the removal.
+        directive_start = re.compile(r"^([ \t]*)\.\.[ \t]+[\w\-]+::")
+
+        kept_lines = []
+        directive_indent = None  # Indent of the directive being skipped
+        pending_blanks = []  # Blank lines not yet known to be body or not
+        for line in content.split("\n"):
+            if directive_indent is not None:
+                if not line.strip():
+                    pending_blanks.append(line)
+                    continue
+                if len(line) - len(line.lstrip(" \t")) > directive_indent:
+                    # Still inside the body, so the blanks were part of it
+                    pending_blanks.clear()
+                    continue
+                # Body ended: the blanks separate it from what follows
+                kept_lines.extend(pending_blanks)
+                pending_blanks.clear()
+                directive_indent = None
+
+            match = directive_start.match(line)
+            if match:
+                directive_indent = len(match.group(1))
+            else:
+                kept_lines.append(line)
+        kept_lines.extend(pending_blanks)
+
+        # Collapse the runs of blank lines that directive removal leaves behind
+        return re.sub(r"\n{3,}", "\n\n", "\n".join(kept_lines))
+
     def _extract_relative_document_path(
         self, source_path: Path
     ) -> Tuple[Optional[str], Optional[str], Optional[List[str]]]:
diff --git a/tests/test_llms_txt.py b/tests/test_llms_txt.py
index ddd07a1..a4ee766 100644
--- a/tests/test_llms_txt.py
+++ b/tests/test_llms_txt.py
@@ -334,3 +334,44 @@ def test_write_verbose_info_with_baseurl(tmp_path):

     assert "- [Home Page](https://example.org/index.html)" in content
     assert "- [About Us](https://example.org/about.html)" in content
+
+
+def test_remove_directives():
+    """Test removing directives from content."""
+    # Create a processor with remove_directives enabled
+    config = {"llms_txt_rm_directives": True}
+    processor = DocumentProcessor(config)
+
+    # Test content with various directives
+    content = """This is a test document.
+
+.. image:: /path/to/image.jpg
+   :alt: An example image
+   :width: 100%
+
+This is a paragraph after the image.
+
+.. note::
+   This is a note.
+
+.. code-block:: python
+
+   def hello_world():
+       print("Hello, world!")
+
+Final paragraph."""
+
+    processed_content = processor._remove_directives(content)
+
+    # Check that directives are removed
+    assert ".. image::" not in processed_content
+    assert ".. note::" not in processed_content
+    assert ".. code-block::" not in processed_content
+
+    # Check that regular content is preserved
+    assert "This is a test document." in processed_content
+    assert "This is a paragraph after the image." in processed_content
+    assert "Final paragraph." in processed_content
+
+    # Check that there are no excessive blank lines
+    assert "\n\n\n" not in processed_content