diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..33d76fa --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,135 @@ +name: CI + +on: + push: + branches: [main, master] + pull_request: + branches: [main, master] + +permissions: + contents: read + +jobs: + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + os: [ubuntu-latest, windows-latest, macos-latest] + python-version: ['3.9', '3.10', '3.11', '3.12'] + + steps: + - uses: actions/checkout@v4 + + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} + + - name: Set up .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: '8.0.x' + + - name: Install Python dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-cov hatch + + - name: Build engine binaries + run: python build_differ.py + + - name: Install package + # Binaries were already built by the previous step; skip the build hook's second pass + env: + SKIP_BINARY_BUILD: "1" + run: pip install -e . + + - name: Run tests with coverage + run: | + pytest tests/ -v --cov=python_redlines --cov-report=xml --cov-report=term-missing + + - name: Upload coverage reports + uses: codecov/codecov-action@v4 + if: matrix.os == 'ubuntu-latest' && matrix.python-version == '3.11' + with: + file: ./coverage.xml + fail_ci_if_error: false + + # Quick test without binaries (unit tests only) + test-unit-only: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest pytest-cov + + - name: Install package (without building binaries) + env: + SKIP_BINARY_BUILD: "1" + run: pip install -e . 
+ + - name: Run unit tests (no integration tests) + run: | + pytest tests/ -v --ignore=tests/test_engines.py -k "not integration" + + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install ruff + + - name: Check code formatting + run: ruff check src/ tests/ --output-format=github + continue-on-error: true + + build: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.11' + + - name: Set up .NET + uses: actions/setup-dotnet@v4 + with: + dotnet-version: '8.0.x' + + - name: Install build dependencies + run: | + python -m pip install --upgrade pip + pip install hatch hatchling + + - name: Build package + run: hatch build + + - name: Check package + run: | + pip install twine + twine check dist/* + + - name: Upload build artifacts + uses: actions/upload-artifact@v4 + with: + name: dist + path: dist/ diff --git a/README.md b/README.md index 712b974..c3a25db 100644 --- a/README.md +++ b/README.md @@ -1,167 +1,305 @@ # Python-Redlines: Docx Redlines (Tracked Changes) for the Python Ecosystem +[![CI](https://github.com/JSv4/Python-Redlines/actions/workflows/ci.yml/badge.svg)](https://github.com/JSv4/Python-Redlines/actions/workflows/ci.yml) +[![PyPI version](https://badge.fury.io/py/python-redlines.svg)](https://pypi.org/project/python-redlines/) +[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) + ## Project Goal - Democratizing DOCX Comparisons The main goal of this project is to address the significant gap in the open-source ecosystem around `.docx` document -comparison tools. 
Currently, the process of comparing and generating redline documents (documents that highlight -changes between versions) is complex and largely dominated by commercial software. These -tools, while effective, often come with cost barriers and limitations in terms of accessibility and integration +comparison tools. Currently, the process of comparing and generating redline documents (documents that highlight +changes between versions) is complex and largely dominated by commercial software. These +tools, while effective, often come with cost barriers and limitations in terms of accessibility and integration flexibility. -`Python-redlines` aims to democratize the ability to run tracked change redlines for .docx, providing the +`Python-redlines` aims to democratize the ability to run tracked change redlines for .docx, providing the open-source community with a tool to create `.docx` redlines without the need for commercial software. This will let more legal hackers and hobbyist innovators experiment and create tooling for enterprise and legal. -## Project Roadmap +## Quick Start -### Step 1. Open-XML-PowerTools `WmlComparer` Wrapper +### Installation -The [Open-XML-PowerTools](https://github.com/OpenXmlDev/Open-Xml-PowerTools) project historically offered a solid -foundation for working with `.docx` files and has an excellent (if imperfect) comparison engine in its `WmlComparer` -class. However, Microsoft archived the repository almost five years ago, and a forked repo is not being actively -maintained, as its most recent commits dates from 2 years ago and the repo issues list is disabled. +```bash +pip install python-redlines +``` -As a first step, our project aims to bring the existing capabilities of WmlCompare into the Python world. Thankfully, -XML Power Tools is full cross-platform as it is written in .NET and compiles with the still-maintained .NET 8. 
The -resulting binaries can be compiled for the latest versions of Windows, OSX and Linux (Ubuntu specifically, though other -distributions should work fine too). We have included an OSX build but do not have an OSX machine to test on. Please -report an issues by opening a new Issue. +Or install directly from GitHub: -The initial release has a single engine `XmlPowerToolsEngine`, which is just a Python wrapper for a simple C# utility -written to leverage WmlComparer for 1-to-1 redlines. We hope this provides a stop-gap capability to Python developers -seeking .docx redline capabilities. +```bash +pip install git+https://github.com/JSv4/Python-Redlines +``` -**Note**, we don't plan to fork or maintain Open-XML-PowerTools. [Version 4.4.0](https://www.nuget.org/packages/Open-Xml-PowerTools/), -which appears to only be compatible with [Open XML SDK < 3.0.0](https://www.nuget.org/packages/DocumentFormat.OpenXml) works -for now, it needs to be made compatible with the latest versions of the Open XML SDK to extend its life. **There are -also some [issues](https://github.com/dotnet/Open-XML-SDK/issues/1634)**, and it seems the only maintainer of -Open-XML-PowerTools probably won't fix, and understanding the existing code base is no small task. Please be aware that -**Open XML PowerTools is not a perfect comparison engine, but it will work for many purposes. Use at your own risk.** +### Basic Usage -### Step 2. Pure Python Comparison Engine +```python +from python_redlines import get_engine -Looking towards the future, rather than reverse engineer `WmlComparer` and maintain a C# codebase, we envision a -comparison engine written in python. We've done some experimentation with [`xmldiff`](https://github.com/Shoobx/xmldiff) -as the engine to compare the underlying xml of docx files. 
Specifically, we've built a prototype to unzip `.docx` files, -execute an xml comparison using `xmldiff`, and then reconstructed a tracked changes docx with the proper Open XML -(ooxml) tracked change tags. Preliminary experimentation with this approach has shown promise, indicating its -feasibility for handling modifications such as simple span inserts and deletes. +# Get a comparison engine (default: openxml-powertools) +engine = get_engine() -However, this ambitious endeavor is not without its challenges. The intricacies of `.docx` files and the potential for -complex, corner-case scenarios necessitate a thoughtful and thorough development process. In the interim, `WmlComparer` -is a great solution as it has clearly been built to account for many such corner cases, through a development process -that clearly was influenced by issues discovered by a large user base. The XMLDiff engine will take some time to reach -a level of maturity similar to WmlComparer. At the moment it is NOT included. +# Load your documents +with open("original.docx", "rb") as f: + original = f.read() +with open("modified.docx", "rb") as f: + modified = f.read() -## Getting started +# Generate redlined document +redline_bytes, stdout, stderr = engine.compare( + author="John Doe", + original=original, + modified=modified +) -### Install .NET Core 8 +# Save the result +with open("redlined.docx", "wb") as f: + f.write(redline_bytes) +``` -The Open-XML-PowerTools engine we're using in the initial releases requires .NET to run (don't worry, this is very -well-supported cross-platform at the moment). Our builds are targeting x86-64 Linux and Windows, however, so you'll -need to modify the build script and build new binaries if you want to target another runtime / architecture. +## Comparison Engines -#### On Linux +Python-Redlines uses a **pluggable engine architecture** that allows you to choose between different comparison backends. 
This design provides flexibility and allows the library to evolve as better comparison tools become available. -You can follow [Microsoft's instructions for your Linux distribution](https://learn.microsoft.com/en-us/dotnet/core/install/linux) +### Available Engines -#### On Windows +| Engine | Name | Status | Description | +|--------|------|--------|-------------| +| [Open-Xml-PowerTools](https://github.com/OpenXmlDev/Open-Xml-PowerTools) | `openxml-powertools` | Default | Original WmlComparer engine. Stable but no longer maintained. | +| [Docxodus](https://github.com/JSv4/Docxodus) | `docxodus` | Optional | Modern .NET 8.0 fork with improved features. **Will become default in future release.** | -You can follow [Microsoft's instructions for your Windows vesrion](https://learn.microsoft.com/en-us/dotnet/core/install/windows?tabs=net80) +### Selecting an Engine -### Install the Library +```python +from python_redlines import get_engine, list_engines -At the moment, we are not distributing via pypi. You can easily install directly from this repo, however. +# See available engines +print(list_engines()) # ['openxml-powertools', 'docxodus'] -```commandline -pip install git+https://github.com/JSv4/Python-Redlines +# Use the default engine (currently openxml-powertools) +engine = get_engine() + +# Or explicitly select an engine +engine = get_engine('openxml-powertools') +engine = get_engine('docxodus') ``` -You can add this as a dependency like so +### Engine Comparison -```requirements -python_redlines @ git+https://github.com/JSv4/Python-Redlines@v0.0.1 -``` +#### Open-Xml-PowerTools (Current Default) -### Use the Library +The [Open-Xml-PowerTools](https://github.com/OpenXmlDev/Open-Xml-PowerTools) engine uses the `WmlComparer` class from Microsoft's archived repository. While stable and well-tested, it has limitations: -If you just want to use the tool, jump into our [quickstart guide](docs/quickstart.md). 
+- ⚠️ Repository archived ~5 years ago +- ⚠️ Limited compatibility with newer Open XML SDK versions +- ⚠️ No active maintenance -## Architecture Overview +#### Docxodus (Recommended, Future Default) -`XmlPowerToolsEngine` is a Python wrapper class for the `redlines` C# command-line tool, source of which is available in -[./csproj/Program.cs](./csproj/Program.cs). The redlines utility and wrapper let you compare two docx files and -show the differences in tracked changes (a "redline" document). +[Docxodus](https://github.com/JSv4/Docxodus) is a modernized fork of Open-Xml-PowerTools, upgraded to .NET 8.0 with active maintenance and improved features: -### C# Functionality +- ✅ **Move Detection** - Identifies when content is relocated rather than deleted and re-inserted +- ✅ **Format Change Detection** - Recognizes styling-only modifications (bold, italic, font changes) +- ✅ **Active Maintenance** - Regular updates and bug fixes +- ✅ **.NET 8.0** - Modern framework support +- ✅ **Configurable Thresholds** - Fine-tune comparison sensitivity -The `redlines` C# utility is a command line tool that requires four arguments: -1. `author_tag` - A tag to identify the author of the changes. -2. `original_path.docx` - Path to the original document. -3. `modified_path.docx` - Path to the modified document. -4. `redline_path.docx` - Path where the redlined document will be saved. +**We plan to make Docxodus the default engine in a future release** once it has been thoroughly tested in production environments. -The Python wrapper, `XmlPowerToolsEngine` and its main method `run_redline()`, simplifies the use of `redlines` by -orchestrating its execution with Python and letting you pass in bytes or file paths for the original and modified -documents. 
+## API Reference -### Packaging +### Core Functions + +```python +from python_redlines import ( + get_engine, # Get an engine instance by name + list_engines, # List all registered engine names + list_available_engines, # List engines with binaries installed +) +``` + +### Engine Interface + +All engines implement the `ComparisonEngine` interface: + +```python +class ComparisonEngine: + @property + def name(self) -> str: + """Engine identifier (e.g., 'openxml-powertools', 'docxodus')""" + + @property + def description(self) -> str: + """Human-readable description""" + + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + """ + Compare two documents and return redlined version. + + Returns: + - bytes: The redlined document + - str | None: Standard output from comparison + - str | None: Standard error (if any) + """ + + def is_available(self) -> bool: + """Check if engine binaries are installed""" +``` + +### Backward Compatibility + +The original `XmlPowerToolsEngine` API is still supported: + +```python +from python_redlines import XmlPowerToolsEngine + +engine = XmlPowerToolsEngine() +redline_bytes, stdout, stderr = engine.run_redline( + author_tag="Author", + original=original_bytes, + modified=modified_bytes +) +``` + +## Architecture Overview + +### Project Structure -The project is structured as follows: ``` python-redlines/ -│ -├── csproj/ -│ ├── bin/ -│ ├── obj/ -│ ├── Program.cs -│ ├── redlines.csproj -│ └── redlines.sln -│ -├── docs/ -│ ├── developer-guide.md -│ └── quickstart.md -│ -├── src/ -│ └── python_redlines/ -│ ├── bin/ -│ │ └── .gitignore -│ ├── dist/ -│ │ ├── .gitignore -│ │ ├── linux-x64-0.0.1.tar.gz -│ │ └── win-x64-0.0.1.zip -│ ├── __about__.py -│ ├── __init__.py -│ └── engines.py -│ +├── csproj/ # Open-Xml-PowerTools CLI wrapper +│ └── Program.cs +├── csproj-docxodus/ # Docxodus CLI wrapper +│ └── Program.cs +├── src/python_redlines/ +│ ├── 
base.py # Abstract ComparisonEngine interface +│ ├── engines.py # Engine implementations +│ ├── registry.py # Engine discovery and registration +│ ├── dist/ +│ │ ├── openxml-powertools/ # Open-Xml-PowerTools binaries +│ │ └── docxodus/ # Docxodus binaries +│ └── bin/ # Extracted binaries (runtime) ├── tests/ -| ├── fixtures/ -| ├── test_openxml_differ.py -| └── __init__.py -| -├── .gitignore -├── build_differ.py -├── extract_version.py -├── License.md -├── pyproject.toml -└── README.md +│ ├── test_base.py +│ ├── test_engines.py +│ ├── test_registry.py +│ └── fixtures/ +└── build_differ.py # Build script for all engines ``` -- `src/your_package/`: Contains the Python wrapper code. -- `dist/`: Contains the zipped C# binaries for different platforms. -- `bin/`: Target directory for extracted binaries. -- `tests/`: Contains test cases and fixtures for the wrapper. +### How It Works + +1. **Python Wrapper** - The `engines.py` module provides Python classes that wrap .NET CLI tools +2. **Binary Management** - Platform-specific binaries are bundled and extracted at runtime +3. **Pluggable Design** - New engines can be added by implementing the `ComparisonEngine` interface + +### Supported Platforms + +| Platform | Architecture | Status | +|----------|-------------|--------| +| Linux | x64 | ✅ Supported | +| Linux | ARM64 | ✅ Supported | +| Windows | x64 | ✅ Supported | +| Windows | ARM64 | ✅ Supported | +| macOS | x64 | ✅ Supported | +| macOS | ARM64 (Apple Silicon) | ✅ Supported | + +## Development + +### Prerequisites + +- Python 3.8+ +- .NET 8.0 SDK (for building binaries) -### Detailed Explanation and Dev Setup +### Setup -If you want to contribute to the library or want to dive into some of the C# packaging architecture, go to our -[developer guide](docs/developer-guide.md). 
+```bash +# Clone the repository +git clone https://github.com/JSv4/Python-Redlines.git +cd Python-Redlines -## Additional Information +# Install in development mode +pip install -e ".[dev]" -- **Contributing**: Contributions to the project should follow the established coding and documentation standards. -- **Issues and Support**: For issues, feature requests, or support, please use the project's issue tracker on GitHub. +# Build engine binaries +python build_differ.py + +# Run tests +pytest tests/ -v +``` + +### Building Specific Engines + +```bash +# Build all engines +python build_differ.py + +# Build only Open-Xml-PowerTools +python build_differ.py --engine openxml-powertools + +# Build only Docxodus +python build_differ.py --engine docxodus + +# Build for a specific platform +python build_differ.py --platform linux-x64 +``` + +### Running Tests + +```bash +# Run all tests +pytest tests/ -v + +# Run with coverage +pytest tests/ -v --cov=python_redlines + +# Run only unit tests (no binaries needed) +pytest tests/ -v -k "not integration" +``` + +## Roadmap + +### Current State (v0.0.4+) + +- ✅ Pluggable engine architecture +- ✅ Open-Xml-PowerTools engine (default) +- ✅ Docxodus engine (optional) +- ✅ Cross-platform support (Linux, Windows, macOS) +- ✅ Comprehensive test suite + +### Planned + +- 🔄 Make Docxodus the default engine +- 📋 Pure Python comparison engine (using [xmldiff](https://github.com/Shoobx/xmldiff)) +- 📋 Additional comparison options and configuration +- 📋 Better error messages and diagnostics + +## Documentation + +- [Quick Start Guide](docs/quickstart.md) +- [Developer Guide](docs/developer-guide.md) + +## Contributing + +Contributions are welcome! Please: + +1. Fork the repository +2. Create a feature branch +3. Make your changes with tests +4. Submit a pull request + +For bugs or feature requests, please [open an issue](https://github.com/JSv4/Python-Redlines/issues). 
## License -MIT +[MIT](License.md) + +## Acknowledgments + +- [Open-Xml-PowerTools](https://github.com/OpenXmlDev/Open-Xml-PowerTools) - Original WmlComparer implementation +- [Docxodus](https://github.com/JSv4/Docxodus) - Modernized fork with active maintenance +- [DocumentFormat.OpenXml](https://github.com/dotnet/Open-XML-SDK) - Microsoft's Open XML SDK diff --git a/build_differ.py b/build_differ.py index 0c6ab53..10ad85b 100644 --- a/build_differ.py +++ b/build_differ.py @@ -1,7 +1,41 @@ +""" +Build script for compiling comparison engine binaries. + +This script builds self-contained .NET executables for multiple platforms +for both the Open-Xml-PowerTools and Docxodus comparison engines. +""" + import subprocess import os +import sys import tarfile import zipfile +import argparse + + +# Engine configurations +ENGINES = { + 'openxml-powertools': { + 'csproj_path': './csproj', + 'binary_name': 'redlines', + 'dist_subdir': 'openxml-powertools', + }, + 'docxodus': { + 'csproj_path': './csproj-docxodus', + 'binary_name': 'redline', + 'dist_subdir': 'docxodus', + }, +} + +# Platform configurations +PLATFORMS = [ + {'rid': 'linux-x64', 'archive_ext': '.tar.gz'}, + {'rid': 'linux-arm64', 'archive_ext': '.tar.gz'}, + {'rid': 'win-x64', 'archive_ext': '.zip'}, + {'rid': 'win-arm64', 'archive_ext': '.zip'}, + {'rid': 'osx-x64', 'archive_ext': '.tar.gz'}, + {'rid': 'osx-arm64', 'archive_ext': '.tar.gz'}, +] def get_version(): @@ -14,19 +48,36 @@ def get_version(): return about['__version__'] -def run_command(command): +def run_command(command, check=True): """ Runs a shell command and prints its output. + Returns True if successful, False otherwise. 
""" - process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + print(f"Running: {command}") + process = subprocess.Popen( + command, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT + ) + output_lines = [] for line in process.stdout: - print(line.decode().strip()) + decoded = line.decode().strip() + print(decoded) + output_lines.append(decoded) + + process.wait() + if check and process.returncode != 0: + print(f"Command failed with return code {process.returncode}") + return False + return True def compress_files(source_dir, target_file): """ Compresses files in the specified directory into a tar.gz or zip file. """ + print(f"Compressing {source_dir} to {target_file}") if target_file.endswith('.tar.gz'): with tarfile.open(target_file, "w:gz") as tar: tar.add(source_dir, arcname=os.path.basename(source_dir)) @@ -34,75 +85,123 @@ def compress_files(source_dir, target_file): with zipfile.ZipFile(target_file, 'w', zipfile.ZIP_DEFLATED) as zipf: for root, dirs, files in os.walk(source_dir): for file in files: - zipf.write(os.path.join(root, file), - os.path.relpath(os.path.join(root, file), - os.path.join(source_dir, '..'))) + file_path = os.path.join(root, file) + arcname = os.path.relpath(file_path, os.path.dirname(source_dir)) + zipf.write(file_path, arcname) def cleanup_old_builds(dist_dir, current_version): """ Deletes any build files ending in .zip or .tar.gz in the dist_dir with a different version tag. 
""" + if not os.path.exists(dist_dir): + return + for file in os.listdir(dist_dir): - if not file.endswith((f'{current_version}.zip', f'{current_version}.tar.gz', '.gitignore')): + if file.endswith(('.zip', '.tar.gz')) and current_version not in file: file_path = os.path.join(dist_dir, file) os.remove(file_path) print(f"Deleted old build file: {file}") -def main(): - version = get_version() - print(f"Version: {version}") - - dist_dir = "./src/python_redlines/dist/" +def build_engine(engine_name, engine_config, version, platforms=None): + """ + Build binaries for a specific engine. - # Build for Linux x64 - print("Building for Linux x64...") - run_command('dotnet publish ./csproj -c Release -r linux-x64 --self-contained') + Args: + engine_name: Name of the engine + engine_config: Configuration dict for the engine + version: Version string + platforms: Optional list of platforms to build (default: all) + """ + csproj_path = engine_config['csproj_path'] + binary_name = engine_config['binary_name'] + dist_subdir = engine_config['dist_subdir'] - # Build for Linux ARM64 - print("Building for Linux ARM64...") - run_command('dotnet publish ./csproj -c Release -r linux-arm64 --self-contained') + dist_dir = f"./src/python_redlines/dist/{dist_subdir}/" - # Build for Windows x64 - print("Building for Windows x64...") - run_command('dotnet publish ./csproj -c Release -r win-x64 --self-contained') + # Ensure dist directory exists + os.makedirs(dist_dir, exist_ok=True) - # Build for Windows ARM64 - print("Building for Windows ARM64...") - run_command('dotnet publish ./csproj -c Release -r win-arm64 --self-contained') + platforms_to_build = platforms or PLATFORMS - # Build for macOS x64 - print("Building for macOS x64...") - run_command('dotnet publish ./csproj -c Release -r osx-x64 --self-contained') + print(f"\n{'='*60}") + print(f"Building {engine_name} engine") + print(f"{'='*60}\n") - # Build for macOS ARM64 - print("Building for macOS ARM64...") - run_command('dotnet publish 
./csproj -c Release -r osx-arm64 --self-contained') + for platform_config in platforms_to_build: + rid = platform_config['rid'] + archive_ext = platform_config['archive_ext'] - # Compress the Linux x64 build - linux_x64_build_dir = './csproj/bin/Release/net8.0/linux-x64' - compress_files(linux_x64_build_dir, f"{dist_dir}/linux-x64-{version}.tar.gz") + print(f"\nBuilding for {rid}...") - # Compress the Linux ARM64 build - linux_arm64_build_dir = './csproj/bin/Release/net8.0/linux-arm64' - compress_files(linux_arm64_build_dir, f"{dist_dir}/linux-arm64-{version}.tar.gz") + # Build the binary + cmd = f'dotnet publish {csproj_path} -c Release -r {rid} --self-contained' + if not run_command(cmd, check=False): + print(f"Warning: Build failed for {rid}") + continue - # Compress the Windows x64 build - windows_build_dir = './csproj/bin/Release/net8.0/win-x64' - compress_files(windows_build_dir, f"{dist_dir}/win-x64-{version}.zip") + # Determine build output directory + build_dir = f'{csproj_path}/bin/Release/net8.0/{rid}' - # Compress the macOS x64 build - macos_x64_build_dir = './csproj/bin/Release/net8.0/osx-x64' - compress_files(macos_x64_build_dir, f"{dist_dir}/osx-x64-{version}.tar.gz") + # Check if build directory exists + if not os.path.exists(build_dir): + print(f"Warning: Build directory not found: {build_dir}") + continue - # Compress the macOS ARM64 build - macos_arm64_build_dir = './csproj/bin/Release/net8.0/osx-arm64' - compress_files(macos_arm64_build_dir, f"{dist_dir}/osx-arm64-{version}.tar.gz") + # Compress to archive + archive_name = f"{rid}-{version}{archive_ext}" + archive_path = os.path.join(dist_dir, archive_name) + compress_files(build_dir, archive_path) + print(f"Created: {archive_path}") + # Cleanup old builds cleanup_old_builds(dist_dir, version) - print("Build and compression complete.") + print(f"\n{engine_name} build complete.") + + +def main(): + parser = argparse.ArgumentParser( + description='Build comparison engine binaries for multiple 
platforms.' + ) + parser.add_argument( + '--engine', + choices=['all'] + list(ENGINES.keys()), + default='all', + help='Which engine to build (default: all)' + ) + parser.add_argument( + '--platform', + choices=['all'] + [p['rid'] for p in PLATFORMS], + default='all', + help='Which platform to build for (default: all)' + ) + + args = parser.parse_args() + + version = get_version() + print(f"Version: {version}") + + # Determine which platforms to build + if args.platform == 'all': + platforms = PLATFORMS + else: + platforms = [p for p in PLATFORMS if p['rid'] == args.platform] + + # Determine which engines to build + if args.engine == 'all': + engines_to_build = ENGINES.items() + else: + engines_to_build = [(args.engine, ENGINES[args.engine])] + + # Build each engine + for engine_name, engine_config in engines_to_build: + build_engine(engine_name, engine_config, version, platforms) + + print("\n" + "="*60) + print("All builds complete.") + print("="*60) if __name__ == "__main__": diff --git a/csproj-docxodus/Program.cs b/csproj-docxodus/Program.cs new file mode 100644 index 0000000..990d605 --- /dev/null +++ b/csproj-docxodus/Program.cs @@ -0,0 +1,73 @@ +using System; +using System.IO; +using Docxodus; +using DocumentFormat.OpenXml.Packaging; + +class Program +{ + static int Main(string[] args) + { + // Parse arguments: <original.docx> <modified.docx> <output.docx> [--author=<name>] + if (args.Length < 3) + { + Console.WriteLine("Usage: redline <original.docx> <modified.docx> <output.docx> [--author=<name>]"); + return 1; + } + + string originalFilePath = args[0]; + string modifiedFilePath = args[1]; + string outputFilePath = args[2]; + string authorTag = "Redline"; + + // Parse optional --author flag + for (int i = 3; i < args.Length; i++) + { + if (args[i].StartsWith("--author=")) + { + authorTag = args[i].Substring("--author=".Length); + } + } + + if (!File.Exists(originalFilePath)) + { + Console.Error.WriteLine($"Error: Original file does not exist: {originalFilePath}"); + return 1; + } + + if (!File.Exists(modifiedFilePath)) + { + Console.Error.WriteLine($"Error: 
Modified file does not exist: {modifiedFilePath}"); + return 1; + } + + try + { + var originalBytes = File.ReadAllBytes(originalFilePath); + var modifiedBytes = File.ReadAllBytes(modifiedFilePath); + var originalDocument = new WmlDocument(originalFilePath, originalBytes); + var modifiedDocument = new WmlDocument(modifiedFilePath, modifiedBytes); + + var comparisonSettings = new WmlComparerSettings + { + AuthorForRevisions = authorTag, + DetailThreshold = 0 + }; + + var comparisonResults = WmlComparer.Compare(originalDocument, modifiedDocument, comparisonSettings); + var revisions = WmlComparer.GetRevisions(comparisonResults, comparisonSettings); + + // Output results + Console.WriteLine($"Revisions found: {revisions.Count}"); + + File.WriteAllBytes(outputFilePath, comparisonResults.DocumentByteArray); + return 0; + } + catch (Exception ex) + { + Console.Error.WriteLine($"Error: {ex.Message}"); + Console.Error.WriteLine("Detailed Stack Trace:"); + Console.Error.WriteLine(ex.StackTrace); + return 1; + } + } +} diff --git a/csproj-docxodus/redline.csproj b/csproj-docxodus/redline.csproj new file mode 100644 index 0000000..85bdfbc --- /dev/null +++ b/csproj-docxodus/redline.csproj @@ -0,0 +1,15 @@ + + + + Exe + net8.0 + enable + enable + redline + + + + + + + diff --git a/hatch_run_build_hook.py b/hatch_run_build_hook.py index f20b2b1..104ad70 100644 --- a/hatch_run_build_hook.py +++ b/hatch_run_build_hook.py @@ -1,9 +1,46 @@ +""" +Hatch build hook for building comparison engine binaries. + +This hook runs during the package build process to compile the +.NET binaries for all supported comparison engines and platforms. 
+""" + +import os import subprocess from hatchling.builders.hooks.plugin.interface import BuildHookInterface + class HatchRunBuildHook(BuildHookInterface): PLUGIN_NAME = 'hatch-run-build' def initialize(self, version, build_data): - # Run the 'hatch run build' command - subprocess.run(["python", "-m", "build_differ"], check=True) \ No newline at end of file + """ + Initialize the build hook by compiling engine binaries. + + This runs the build_differ.py script which compiles self-contained + .NET executables for all engines and platforms. + """ + # Check if we should skip the build (useful for development) + if os.environ.get('SKIP_BINARY_BUILD', '').lower() in ('1', 'true', 'yes'): + print("Skipping binary build (SKIP_BINARY_BUILD is set)") + return + + # Run the build script + print("Building comparison engine binaries...") + try: + result = subprocess.run( + ["python", "-m", "build_differ"], + check=True, + capture_output=True, + text=True + ) + if result.stdout: + print(result.stdout) + except subprocess.CalledProcessError as e: + print(f"Warning: Binary build failed: {e}") + if e.stdout: + print(f"stdout: {e.stdout}") + if e.stderr: + print(f"stderr: {e.stderr}") + # Don't fail the build - binaries might already exist + # or the user might be installing on a platform we don't build for diff --git a/pyproject.toml b/pyproject.toml index 4494c19..cd87036 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,11 +23,11 @@ path = "hatch_run_build_hook.py" [project] name = "python-redlines" dynamic = ["version"] -description = '' +description = 'A pluggable document comparison library for generating redlined Word documents with tracked changes' readme = "README.md" requires-python = ">=3.8" license = "MIT" -keywords = [] +keywords = ["docx", "word", "redline", "comparison", "tracked-changes", "diff"] authors = [ { name = "John Scrudato IV" }, ] @@ -41,16 +41,25 @@ classifiers = [ "Programming Language :: Python :: 3.12", "Programming Language :: Python :: 
Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", + "Topic :: Office/Business :: Office Suites", + "Topic :: Text Processing :: Markup", ] -dependencies = [ +dependencies = [] + +[project.optional-dependencies] +# For development/building +dev = [ "hatch", - "hatchling" + "hatchling", + "coverage[toml]>=6.5", + "pytest", + "pytest-cov", ] [project.urls] -Documentation = "https://github.com/unknown/python-redlines#readme" -Issues = "https://github.com/unknown/python-redlines/issues" -Source = "https://github.com/unknown/python-redlines" +Documentation = "https://github.com/JSv4/Python-Redlines#readme" +Issues = "https://github.com/JSv4/Python-Redlines/issues" +Source = "https://github.com/JSv4/Python-Redlines" [tool.hatch.version] path = "src/python_redlines/__about__.py" @@ -59,7 +68,9 @@ path = "src/python_redlines/__about__.py" dependencies = [ "coverage[toml]>=6.5", "pytest", + "pytest-cov", ] + [tool.hatch.envs.default.scripts] test = "pytest {args:tests}" test-cov = "coverage run -m pytest {args:tests}" @@ -71,7 +82,9 @@ cov = [ "test-cov", "cov-report", ] -build = "python -m build_differ" +build-engines = "python -m build_differ" +build-openxml = "python -m build_differ --engine openxml-powertools" +build-docxodus = "python -m build_differ --engine docxodus" [[tool.hatch.envs.all.matrix]] python = ["3.8", "3.9", "3.10", "3.11", "3.12"] @@ -101,3 +114,9 @@ exclude_lines = [ "if __name__ == .__main__.:", "if TYPE_CHECKING:", ] + +[tool.pytest.ini_options] +testpaths = ["tests"] +python_files = ["test_*.py"] +python_functions = ["test_*"] +addopts = "-v --tb=short" diff --git a/src/python_redlines/__init__.py b/src/python_redlines/__init__.py index ec15624..6f5d31e 100644 --- a/src/python_redlines/__init__.py +++ b/src/python_redlines/__init__.py @@ -1,3 +1,74 @@ -# SPDX-FileCopyrightText: 2024-present U.N. 
Owen +# SPDX-FileCopyrightText: 2024-present John Scrudato IV # # SPDX-License-Identifier: MIT + +""" +Python Redlines - Document comparison library for generating redlined Word documents. + +This library provides a pluggable comparison engine system for generating +Word documents with tracked changes (redlines) from original and modified documents. + +Quick Start: + from python_redlines import get_engine + + # Get the default engine + engine = get_engine() + + # Compare two documents + redline_bytes, stdout, stderr = engine.compare( + author="John Doe", + original=original_doc_bytes, + modified=modified_doc_bytes + ) + + # Save the result + with open("redlined.docx", "wb") as f: + f.write(redline_bytes) + +Available Engines: + - 'openxml-powertools': Uses Open-Xml-PowerTools WmlComparer (default) + - 'docxodus': Uses Docxodus, a modern .NET 8.0 fork with improved features + +Selecting an Engine: + # Use the default engine (openxml-powertools) + engine = get_engine() + + # Use a specific engine + engine = get_engine('docxodus') + + # List available engines + from python_redlines import list_available_engines + engines = list_available_engines() +""" + +from .__about__ import __version__ + +# Import base classes +from .base import ComparisonEngine, ComparisonError + +# Import engine implementations +from .engines import XmlPowerToolsEngine, DocxodusEngine + +# Import registry functions +from .registry import ( + get_engine, + list_engines, + list_available_engines, + EngineRegistry, +) + +__all__ = [ + # Version + "__version__", + # Base classes + "ComparisonEngine", + "ComparisonError", + # Engine implementations + "XmlPowerToolsEngine", + "DocxodusEngine", + # Registry + "get_engine", + "list_engines", + "list_available_engines", + "EngineRegistry", +] diff --git a/src/python_redlines/base.py b/src/python_redlines/base.py new file mode 100644 index 0000000..2981acf --- /dev/null +++ b/src/python_redlines/base.py @@ -0,0 +1,120 @@ +""" +Abstract base class for 
document comparison engines. + +This module provides the foundation for a pluggable comparison engine system, +allowing different backends (e.g., Open-Xml-PowerTools, Docxodus) to be used +interchangeably for generating redlined Word documents with tracked changes. +""" + +from abc import ABC, abstractmethod +from pathlib import Path +from typing import Union, Tuple, Optional + + +class ComparisonEngine(ABC): + """ + Abstract base class for document comparison engines. + + All comparison engines must implement the `compare` method which takes + two Word documents and produces a redlined version with tracked changes. + + Example usage: + engine = SomeComparisonEngine() + redline_bytes, stdout, stderr = engine.compare( + author="John Doe", + original=original_doc_bytes, + modified=modified_doc_bytes + ) + """ + + @property + @abstractmethod + def name(self) -> str: + """ + Return the unique identifier name for this engine. + + Returns: + str: Engine name (e.g., 'openxml-powertools', 'docxodus') + """ + pass + + @property + @abstractmethod + def description(self) -> str: + """ + Return a human-readable description of this engine. + + Returns: + str: Description of the engine and its capabilities + """ + pass + + @abstractmethod + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + """ + Compare two Word documents and generate a redlined version with tracked changes. + + Args: + author: The author name to attribute revisions to in the tracked changes. + original: The original document, either as bytes or a Path to a .docx file. + modified: The modified document, either as bytes or a Path to a .docx file. 
+ + Returns: + A tuple containing: + - bytes: The redlined document as bytes + - Optional[str]: Standard output from the comparison process (if any) + - Optional[str]: Standard error from the comparison process (if any) + + Raises: + FileNotFoundError: If a path is provided but the file doesn't exist. + ComparisonError: If the comparison process fails. + """ + pass + + @abstractmethod + def is_available(self) -> bool: + """ + Check if this engine is available and properly configured. + + This method should verify that all required dependencies (e.g., binaries) + are present and the engine can be used. + + Returns: + bool: True if the engine is ready to use, False otherwise. + """ + pass + + +class ComparisonError(Exception): + """ + Exception raised when a document comparison fails. + + Attributes: + message: Human-readable error message + stdout: Standard output from the failed process (if available) + stderr: Standard error from the failed process (if available) + """ + + def __init__( + self, + message: str, + stdout: Optional[str] = None, + stderr: Optional[str] = None + ): + super().__init__(message) + self.message = message + self.stdout = stdout + self.stderr = stderr + + def __str__(self) -> str: + parts = [self.message] + if self.stdout: + parts.append(f"stdout: {self.stdout}") + if self.stderr: + parts.append(f"stderr: {self.stderr}") + return "\n".join(parts) diff --git a/src/python_redlines/dist/.gitignore b/src/python_redlines/dist/.gitignore index c96a04f..19d1530 100644 --- a/src/python_redlines/dist/.gitignore +++ b/src/python_redlines/dist/.gitignore @@ -1,2 +1,4 @@ -* -!.gitignore \ No newline at end of file +# Ignore compiled binaries, keep directory structure +*.zip +*.tar.gz +!.gitignore diff --git a/src/python_redlines/dist/docxodus/.gitignore b/src/python_redlines/dist/docxodus/.gitignore new file mode 100644 index 0000000..475a2e1 --- /dev/null +++ b/src/python_redlines/dist/docxodus/.gitignore @@ -0,0 +1,4 @@ +# Ignore compiled binaries 
+*.zip +*.tar.gz +!.gitignore diff --git a/src/python_redlines/dist/openxml-powertools/.gitignore b/src/python_redlines/dist/openxml-powertools/.gitignore new file mode 100644 index 0000000..475a2e1 --- /dev/null +++ b/src/python_redlines/dist/openxml-powertools/.gitignore @@ -0,0 +1,4 @@ +# Ignore compiled binaries +*.zip +*.tar.gz +!.gitignore diff --git a/src/python_redlines/engines.py b/src/python_redlines/engines.py index be80512..fde50c1 100644 --- a/src/python_redlines/engines.py +++ b/src/python_redlines/engines.py @@ -1,3 +1,10 @@ +""" +Comparison engine implementations for document redlining. + +This module provides concrete implementations of the ComparisonEngine interface +for generating redlined Word documents with tracked changes. +""" + import subprocess import tempfile import os @@ -8,60 +15,103 @@ from pathlib import Path from typing import Union, Tuple, Optional +from .base import ComparisonEngine, ComparisonError from .__about__ import __version__ logger = logging.getLogger(__name__) -class XmlPowerToolsEngine(object): - def __init__(self, target_path: Optional[str] = None): +class BinaryManager: + """ + Utility class for managing platform-specific binary extraction and caching. + + This class handles the extraction and caching of compiled binaries for + different platforms (Windows, Linux, macOS) and architectures (x64, ARM64). + """ + + def __init__( + self, + engine_name: str, + dist_subdir: str, + binary_base_name: str, + version: str, + target_path: Optional[str] = None + ): + """ + Initialize the binary manager. 
+ + Args: + engine_name: Identifier for the engine (used in error messages) + dist_subdir: Subdirectory under dist/ where binaries are stored + binary_base_name: Base name of the binary (without extension) + version: Version string for the binaries + target_path: Optional custom path to extract binaries to + """ + self.engine_name = engine_name + self.dist_subdir = dist_subdir + self.binary_base_name = binary_base_name + self.version = version self.target_path = target_path - self.extracted_binaries_path = self.__unzip_binary() + self._binary_path: Optional[str] = None - def __unzip_binary(self): + def get_binary_path(self) -> str: """ - Unzips the appropriate C# binary for the current platform. + Get the path to the extracted binary, extracting it if necessary. + + Returns: + str: Full path to the binary executable + + Raises: + EnvironmentError: If the platform/architecture is not supported + FileNotFoundError: If the binary archive is not found """ - base_path = os.path.dirname(__file__) - binaries_path = os.path.join(base_path, 'dist') - target_path = self.target_path if self.target_path else os.path.join(base_path, 'bin') + if self._binary_path and os.path.exists(self._binary_path): + return self._binary_path + + self._binary_path = self._extract_binary() + return self._binary_path + + def _extract_binary(self) -> str: + """Extract the appropriate binary for the current platform.""" + base_path = os.path.dirname(os.path.dirname(__file__)) + module_path = os.path.dirname(__file__) + binaries_path = os.path.join(module_path, 'dist', self.dist_subdir) + target_path = self.target_path or os.path.join(module_path, 'bin', self.dist_subdir) if not os.path.exists(target_path): os.makedirs(target_path) - # Get the binary name and zip name based on the OS and architecture - binary_name, zip_name = self.__get_binaries_info() - - # Check if the binary already exists. If not, extract it. 
+ binary_name, archive_name = self._get_platform_info() full_binary_path = os.path.join(target_path, binary_name) if not os.path.exists(full_binary_path): - zip_path = os.path.join(binaries_path, zip_name) - self.__extract_binary(zip_path, target_path) + archive_path = os.path.join(binaries_path, archive_name) + if not os.path.exists(archive_path): + raise FileNotFoundError( + f"Binary archive not found for {self.engine_name}: {archive_path}. " + f"Please ensure the package was installed correctly." + ) + self._extract_archive(archive_path, target_path) - return os.path.join(target_path, binary_name) + return full_binary_path - def __extract_binary(self, zip_path: str, target_path: str): - """ - Extracts the binary from the zip file based on the extension. Supports .zip and .tar.gz files - :parameter - zip_path: str - The path to the zip file - target_path: str - The path to extract the binary to - """ - if zip_path.endswith('.zip'): - with zipfile.ZipFile(zip_path, 'r') as zip_ref: + def _extract_archive(self, archive_path: str, target_path: str) -> None: + """Extract a .zip or .tar.gz archive to the target path.""" + if archive_path.endswith('.zip'): + with zipfile.ZipFile(archive_path, 'r') as zip_ref: zip_ref.extractall(target_path) - - elif zip_path.endswith('.tar.gz'): - with tarfile.open(zip_path, 'r:gz') as tar_ref: + elif archive_path.endswith('.tar.gz'): + with tarfile.open(archive_path, 'r:gz') as tar_ref: tar_ref.extractall(target_path) + else: + raise ValueError(f"Unsupported archive format: {archive_path}") - def __get_binaries_info(self): + def _get_platform_info(self) -> Tuple[str, str]: """ - Returns the binary name and zip name based on the OS and architecture - :return - binary_name: str - The name of the binary file - zip_name: str - The name of the zip file + Get binary name and archive name based on OS and architecture. 
+ + Returns: + Tuple of (binary_name, archive_name) """ os_name = platform.system().lower() arch = platform.machine().lower() @@ -74,63 +124,323 @@ def __get_binaries_info(self): raise EnvironmentError(f"Unsupported architecture: {arch}") if os_name == 'linux': - zip_name = f"linux-{arch}-{__version__}.tar.gz" - binary_name = f'linux-{arch}/redlines' - + archive_name = f"linux-{arch}-{self.version}.tar.gz" + binary_name = f"linux-{arch}/{self.binary_base_name}" elif os_name == 'windows': - zip_name = f"win-{arch}-{__version__}.zip" - binary_name = f'win-{arch}/redlines.exe' - + archive_name = f"win-{arch}-{self.version}.zip" + binary_name = f"win-{arch}/{self.binary_base_name}.exe" elif os_name == 'darwin': - zip_name = f"osx-{arch}-{__version__}.tar.gz" - binary_name = f'osx-{arch}/redlines' - + archive_name = f"osx-{arch}-{self.version}.tar.gz" + binary_name = f"osx-{arch}/{self.binary_base_name}" else: - raise EnvironmentError("Unsupported OS") + raise EnvironmentError(f"Unsupported OS: {os_name}") + + return binary_name, archive_name + + def is_available(self) -> bool: + """Check if the binary is available for extraction.""" + try: + binary_name, archive_name = self._get_platform_info() + module_path = os.path.dirname(__file__) + binaries_path = os.path.join(module_path, 'dist', self.dist_subdir) + archive_path = os.path.join(binaries_path, archive_name) + return os.path.exists(archive_path) + except EnvironmentError: + return False + + +class XmlPowerToolsEngine(ComparisonEngine): + """ + Comparison engine using Open-Xml-PowerTools (WmlComparer). - return binary_name, zip_name + This engine wraps the Open-Xml-PowerTools C# library which provides + Word document comparison functionality through the WmlComparer class. + + Note: This uses the original Open-Xml-PowerTools package which is + no longer actively maintained but is stable and well-tested. + """ + + def __init__(self, target_path: Optional[str] = None): + """ + Initialize the XmlPowerToolsEngine. 
- def run_redline(self, author_tag: str, original: Union[bytes, Path], modified: Union[bytes, Path]) \ - -> Tuple[bytes, Optional[str], Optional[str]]: + Args: + target_path: Optional custom path for extracting binaries. + If not specified, uses the default bin/ directory. """ - Runs the redlines binary. The 'original' and 'modified' arguments can be either bytes or file paths. - Returns the redline output as bytes. + self._binary_manager = BinaryManager( + engine_name="XmlPowerTools", + dist_subdir="openxml-powertools", + binary_base_name="redlines", + version=__version__, + target_path=target_path + ) + # Eagerly extract binary to maintain backward compatibility + self._binary_path = self._binary_manager.get_binary_path() + + @property + def name(self) -> str: + return "openxml-powertools" + + @property + def description(self) -> str: + return ( + "Open-Xml-PowerTools engine using WmlComparer for Word document comparison. " + "Stable and well-tested but no longer actively maintained." + ) + + def is_available(self) -> bool: + return self._binary_manager.is_available() + + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + """ + Compare two Word documents and generate a redlined version. 
+ + Args: + author: Author name for tracked changes attribution + original: Original document (bytes or Path) + modified: Modified document (bytes or Path) + + Returns: + Tuple of (redline_bytes, stdout, stderr) + + Raises: + ComparisonError: If the comparison process fails """ temp_files = [] try: + target_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx') + target_file.close() + target_path = target_file.name - target_path = tempfile.NamedTemporaryFile(delete=False).name - original_path = self._write_to_temp_file(original) if isinstance(original, bytes) else original - modified_path = self._write_to_temp_file(modified) if isinstance(modified, bytes) else modified - temp_files.extend([target_path, original_path, modified_path]) + original_path = self._write_to_temp_file(original) if isinstance(original, bytes) else str(original) + modified_path = self._write_to_temp_file(modified) if isinstance(modified, bytes) else str(modified) - command = [self.extracted_binaries_path, author_tag, original_path, modified_path, target_path] + temp_files.extend([target_path]) + if isinstance(original, bytes): + temp_files.append(original_path) + if isinstance(modified, bytes): + temp_files.append(modified_path) - # Capture stdout and stderr - result = subprocess.run(command, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) + command = [self._binary_path, author, original_path, modified_path, target_path] - stdout_output = result.stdout if isinstance(result.stdout, str) and len(result.stdout) > 0 else None - stderr_output = result.stderr if isinstance(result.stderr, str) and len(result.stderr) > 0 else None + result = subprocess.run( + command, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) - redline_output = Path(target_path).read_bytes() + stdout_output = result.stdout if result.stdout else None + stderr_output = result.stderr if result.stderr else None + + if result.returncode != 0: + raise ComparisonError( 
+ f"Comparison failed with return code {result.returncode}", + stdout=stdout_output, + stderr=stderr_output + ) + + if not os.path.exists(target_path): + raise ComparisonError( + "Comparison did not produce output file", + stdout=stdout_output, + stderr=stderr_output + ) + redline_output = Path(target_path).read_bytes() return redline_output, stdout_output, stderr_output finally: self._cleanup_temp_files(temp_files) - def _cleanup_temp_files(self, temp_files): + def run_redline( + self, + author_tag: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + """ + Backward-compatible alias for compare(). + + Deprecated: Use compare() instead. + """ + return self.compare(author_tag, original, modified) + + # Legacy property for backward compatibility + @property + def extracted_binaries_path(self) -> str: + """Legacy property for backward compatibility.""" + return self._binary_path + + def _cleanup_temp_files(self, temp_files: list) -> None: + """Clean up temporary files.""" for file_path in temp_files: try: - os.remove(file_path) + if os.path.exists(file_path): + os.remove(file_path) except OSError as e: - print(f"Error deleting temp file {file_path}: {e}") + logger.warning(f"Error deleting temp file {file_path}: {e}") + + def _write_to_temp_file(self, data: bytes) -> str: + """Write bytes to a temporary file and return the path.""" + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx') + temp_file.write(data) + temp_file.close() + return temp_file.name + - def _write_to_temp_file(self, data): +class DocxodusEngine(ComparisonEngine): + """ + Comparison engine using Docxodus (modern fork of Open-Xml-PowerTools). + + Docxodus is a .NET 8.0 modernization of Open-Xml-PowerTools with + improved move detection, format change detection, and active maintenance. 
+ + Features over XmlPowerTools: + - Better move detection (identifies relocated content) + - Format change detection (recognizes styling-only modifications) + - Active maintenance and .NET 8.0 support + - Configurable similarity thresholds + """ + + def __init__(self, target_path: Optional[str] = None): """ - Writes bytes data to a temporary file and returns the file path. + Initialize the DocxodusEngine. + + Args: + target_path: Optional custom path for extracting binaries. + If not specified, uses the default bin/ directory. """ - temp_file = tempfile.NamedTemporaryFile(delete=False) + self._binary_manager = BinaryManager( + engine_name="Docxodus", + dist_subdir="docxodus", + binary_base_name="redline", + version=__version__, + target_path=target_path + ) + self._binary_path: Optional[str] = None + + def _ensure_binary(self) -> str: + """Ensure binary is extracted and return its path.""" + if self._binary_path is None: + self._binary_path = self._binary_manager.get_binary_path() + return self._binary_path + + @property + def name(self) -> str: + return "docxodus" + + @property + def description(self) -> str: + return ( + "Docxodus engine - a modern .NET 8.0 fork of Open-Xml-PowerTools with " + "improved move detection, format change detection, and active maintenance." + ) + + def is_available(self) -> bool: + return self._binary_manager.is_available() + + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + """ + Compare two Word documents and generate a redlined version. 
+ + Args: + author: Author name for tracked changes attribution + original: Original document (bytes or Path) + modified: Modified document (bytes or Path) + + Returns: + Tuple of (redline_bytes, stdout, stderr) + + Raises: + ComparisonError: If the comparison process fails + """ + binary_path = self._ensure_binary() + temp_files = [] + + try: + target_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx') + target_file.close() + target_path = target_file.name + + original_path = self._write_to_temp_file(original) if isinstance(original, bytes) else str(original) + modified_path = self._write_to_temp_file(modified) if isinstance(modified, bytes) else str(modified) + + temp_files.append(target_path) + if isinstance(original, bytes): + temp_files.append(original_path) + if isinstance(modified, bytes): + temp_files.append(modified_path) + + # Docxodus redline CLI: redline [--author=] + command = [binary_path, original_path, modified_path, target_path, f"--author={author}"] + + result = subprocess.run( + command, + check=False, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True + ) + + stdout_output = result.stdout if result.stdout else None + stderr_output = result.stderr if result.stderr else None + + if result.returncode != 0: + raise ComparisonError( + f"Comparison failed with return code {result.returncode}", + stdout=stdout_output, + stderr=stderr_output + ) + + if not os.path.exists(target_path): + raise ComparisonError( + "Comparison did not produce output file", + stdout=stdout_output, + stderr=stderr_output + ) + + redline_output = Path(target_path).read_bytes() + return redline_output, stdout_output, stderr_output + + finally: + self._cleanup_temp_files(temp_files) + + def run_redline( + self, + author_tag: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + """ + Alias for compare() to maintain consistent API with XmlPowerToolsEngine. 
+ """ + return self.compare(author_tag, original, modified) + + def _cleanup_temp_files(self, temp_files: list) -> None: + """Clean up temporary files.""" + for file_path in temp_files: + try: + if os.path.exists(file_path): + os.remove(file_path) + except OSError as e: + logger.warning(f"Error deleting temp file {file_path}: {e}") + + def _write_to_temp_file(self, data: bytes) -> str: + """Write bytes to a temporary file and return the path.""" + temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.docx') temp_file.write(data) temp_file.close() return temp_file.name diff --git a/src/python_redlines/registry.py b/src/python_redlines/registry.py new file mode 100644 index 0000000..b143074 --- /dev/null +++ b/src/python_redlines/registry.py @@ -0,0 +1,250 @@ +""" +Engine registry for dynamic comparison engine selection. + +This module provides a registry pattern for discovering and instantiating +comparison engines at runtime, allowing users to choose which backend +to use for document comparison. +""" + +from typing import Dict, List, Optional, Type + +from .base import ComparisonEngine + + +class EngineRegistry: + """ + Registry for managing comparison engine implementations. + + The registry allows engines to be registered, discovered, and instantiated + by name, enabling runtime selection of comparison backends. + + Example usage: + # Get the default engine + engine = EngineRegistry.get_engine() + + # Get a specific engine by name + engine = EngineRegistry.get_engine('docxodus') + + # List available engines + engines = EngineRegistry.list_available_engines() + """ + + _engines: Dict[str, Type[ComparisonEngine]] = {} + _default_engine: Optional[str] = None + + @classmethod + def register( + cls, + engine_class: Type[ComparisonEngine], + default: bool = False + ) -> None: + """ + Register a comparison engine class. 
+ + Args: + engine_class: The engine class to register (must be a ComparisonEngine subclass) + default: If True, set this as the default engine + """ + # Create a temporary instance to get the name + # We use a try/except since the engine might fail if binaries aren't available + try: + temp_instance = object.__new__(engine_class) + # Call the name property getter directly without full initialization + name = engine_class.name.fget(temp_instance) + except Exception: + # Fallback: use class name lowercased + name = engine_class.__name__.lower().replace('engine', '') + + cls._engines[name] = engine_class + + if default or cls._default_engine is None: + cls._default_engine = name + + @classmethod + def get_engine( + cls, + name: Optional[str] = None, + **kwargs + ) -> ComparisonEngine: + """ + Get an instance of a comparison engine. + + Args: + name: Name of the engine to get. If None, returns the default engine. + **kwargs: Additional arguments to pass to the engine constructor. + + Returns: + An instance of the requested comparison engine. + + Raises: + ValueError: If the requested engine is not registered. + RuntimeError: If no engines are registered. + """ + if not cls._engines: + raise RuntimeError( + "No comparison engines registered. " + "Make sure the engines module is imported." + ) + + if name is None: + name = cls._default_engine + + if name not in cls._engines: + available = ', '.join(cls._engines.keys()) + raise ValueError( + f"Unknown engine '{name}'. Available engines: {available}" + ) + + return cls._engines[name](**kwargs) + + @classmethod + def list_engines(cls) -> List[str]: + """ + List all registered engine names. + + Returns: + List of registered engine names. + """ + return list(cls._engines.keys()) + + @classmethod + def list_available_engines(cls) -> List[str]: + """ + List engines that are actually available (binaries installed). + + Returns: + List of available engine names. 
+ """ + available = [] + for name, engine_class in cls._engines.items(): + try: + # Try to create instance without initializing binaries + instance = engine_class.__new__(engine_class) + # Manually set up the binary manager to check availability + from .engines import BinaryManager + from .__about__ import __version__ + + if name == 'openxml-powertools': + manager = BinaryManager( + engine_name="XmlPowerTools", + dist_subdir="openxml-powertools", + binary_base_name="redlines", + version=__version__ + ) + elif name == 'docxodus': + manager = BinaryManager( + engine_name="Docxodus", + dist_subdir="docxodus", + binary_base_name="redline", + version=__version__ + ) + else: + # For other engines, try full instantiation + engine_class() + available.append(name) + continue + + if manager.is_available(): + available.append(name) + except Exception: + pass + + return available + + @classmethod + def get_default_engine_name(cls) -> Optional[str]: + """ + Get the name of the default engine. + + Returns: + Name of the default engine, or None if no engines registered. + """ + return cls._default_engine + + @classmethod + def set_default_engine(cls, name: str) -> None: + """ + Set the default engine by name. + + Args: + name: Name of the engine to set as default. + + Raises: + ValueError: If the engine is not registered. + """ + if name not in cls._engines: + available = ', '.join(cls._engines.keys()) + raise ValueError( + f"Unknown engine '{name}'. Available engines: {available}" + ) + cls._default_engine = name + + @classmethod + def clear(cls) -> None: + """ + Clear all registered engines. Mainly useful for testing. + """ + cls._engines.clear() + cls._default_engine = None + + +def get_engine(name: Optional[str] = None, **kwargs) -> ComparisonEngine: + """ + Convenience function to get a comparison engine instance. + + Args: + name: Name of the engine to get. If None, returns the default engine. 
+ Available engines: 'openxml-powertools', 'docxodus' + **kwargs: Additional arguments to pass to the engine constructor. + + Returns: + An instance of the requested comparison engine. + + Example: + # Get the default engine + engine = get_engine() + + # Get a specific engine + engine = get_engine('docxodus') + + # Compare documents + redline_bytes, stdout, stderr = engine.compare( + author="John Doe", + original=original_bytes, + modified=modified_bytes + ) + """ + return EngineRegistry.get_engine(name, **kwargs) + + +def list_engines() -> List[str]: + """ + List all registered comparison engine names. + + Returns: + List of engine names (e.g., ['openxml-powertools', 'docxodus']) + """ + return EngineRegistry.list_engines() + + +def list_available_engines() -> List[str]: + """ + List comparison engines that have their binaries installed. + + Returns: + List of available engine names. + """ + return EngineRegistry.list_available_engines() + + +# Auto-register engines when this module is imported +def _auto_register_engines(): + """Register all known engines.""" + from .engines import XmlPowerToolsEngine, DocxodusEngine + + # Register XmlPowerToolsEngine as the default (for backward compatibility) + EngineRegistry.register(XmlPowerToolsEngine, default=True) + EngineRegistry.register(DocxodusEngine) + + +_auto_register_engines() diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..fee1878 --- /dev/null +++ b/tests/test_base.py @@ -0,0 +1,169 @@ +""" +Tests for the base comparison engine classes. 
+""" + +import pytest +from pathlib import Path +from typing import Tuple, Optional, Union + +from python_redlines.base import ComparisonEngine, ComparisonError + + +class TestComparisonError: + """Tests for the ComparisonError exception class.""" + + def test_comparison_error_message_only(self): + """Test ComparisonError with just a message.""" + error = ComparisonError("Something went wrong") + assert error.message == "Something went wrong" + assert error.stdout is None + assert error.stderr is None + assert str(error) == "Something went wrong" + + def test_comparison_error_with_stdout(self): + """Test ComparisonError with stdout.""" + error = ComparisonError("Error occurred", stdout="output here") + assert error.message == "Error occurred" + assert error.stdout == "output here" + assert error.stderr is None + assert "stdout: output here" in str(error) + + def test_comparison_error_with_stderr(self): + """Test ComparisonError with stderr.""" + error = ComparisonError("Error occurred", stderr="error details") + assert error.message == "Error occurred" + assert error.stdout is None + assert error.stderr == "error details" + assert "stderr: error details" in str(error) + + def test_comparison_error_with_both_outputs(self): + """Test ComparisonError with both stdout and stderr.""" + error = ComparisonError( + "Comparison failed", + stdout="some output", + stderr="some error" + ) + assert error.message == "Comparison failed" + assert error.stdout == "some output" + assert error.stderr == "some error" + error_str = str(error) + assert "Comparison failed" in error_str + assert "stdout: some output" in error_str + assert "stderr: some error" in error_str + + def test_comparison_error_is_exception(self): + """Test that ComparisonError is a proper Exception.""" + error = ComparisonError("test") + assert isinstance(error, Exception) + + with pytest.raises(ComparisonError) as exc_info: + raise ComparisonError("raised error", stdout="out", stderr="err") + + assert 
exc_info.value.message == "raised error" + + +class TestComparisonEngineInterface: + """Tests for the ComparisonEngine abstract base class interface.""" + + def test_cannot_instantiate_abstract_class(self): + """Test that ComparisonEngine cannot be directly instantiated.""" + with pytest.raises(TypeError): + ComparisonEngine() + + def test_subclass_must_implement_name(self): + """Test that subclasses must implement the name property.""" + class IncompleteEngine(ComparisonEngine): + @property + def description(self) -> str: + return "test" + + def compare(self, author, original, modified): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + IncompleteEngine() + + def test_subclass_must_implement_description(self): + """Test that subclasses must implement the description property.""" + class IncompleteEngine(ComparisonEngine): + @property + def name(self) -> str: + return "test" + + def compare(self, author, original, modified): + pass + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + IncompleteEngine() + + def test_subclass_must_implement_compare(self): + """Test that subclasses must implement the compare method.""" + class IncompleteEngine(ComparisonEngine): + @property + def name(self) -> str: + return "test" + + @property + def description(self) -> str: + return "test" + + def is_available(self) -> bool: + return True + + with pytest.raises(TypeError): + IncompleteEngine() + + def test_subclass_must_implement_is_available(self): + """Test that subclasses must implement the is_available method.""" + class IncompleteEngine(ComparisonEngine): + @property + def name(self) -> str: + return "test" + + @property + def description(self) -> str: + return "test" + + def compare(self, author, original, modified): + pass + + with pytest.raises(TypeError): + IncompleteEngine() + + def test_complete_subclass_can_be_instantiated(self): + """Test that a complete subclass can be instantiated.""" + 
class CompleteEngine(ComparisonEngine): + @property + def name(self) -> str: + return "complete-engine" + + @property + def description(self) -> str: + return "A complete test engine" + + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + return b"result", "stdout", None + + def is_available(self) -> bool: + return True + + engine = CompleteEngine() + assert engine.name == "complete-engine" + assert engine.description == "A complete test engine" + assert engine.is_available() is True + + result, stdout, stderr = engine.compare("author", b"orig", b"mod") + assert result == b"result" + assert stdout == "stdout" + assert stderr is None diff --git a/tests/test_engines.py b/tests/test_engines.py new file mode 100644 index 0000000..5a3cc08 --- /dev/null +++ b/tests/test_engines.py @@ -0,0 +1,393 @@ +""" +Tests for the comparison engine implementations. +""" + +import os +import pytest +import tempfile +from pathlib import Path +from unittest.mock import patch, MagicMock + +from python_redlines.engines import ( + BinaryManager, + XmlPowerToolsEngine, + DocxodusEngine, +) +from python_redlines.base import ComparisonEngine, ComparisonError + + +# Helper function to load test fixtures +def load_docx_bytes(file_path): + """Load a docx file as bytes.""" + with open(file_path, 'rb') as file: + return file.read() + + +@pytest.fixture +def original_docx(): + """Load the original test document.""" + return load_docx_bytes('tests/fixtures/original.docx') + + +@pytest.fixture +def modified_docx(): + """Load the modified test document.""" + return load_docx_bytes('tests/fixtures/modified.docx') + + +class TestBinaryManager: + """Tests for the BinaryManager utility class.""" + + def test_get_platform_info_linux_x64(self): + """Test platform info detection for Linux x64.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + 
version="1.0.0" + ) + + with patch('platform.system', return_value='Linux'), \ + patch('platform.machine', return_value='x86_64'): + binary_name, archive_name = manager._get_platform_info() + + assert binary_name == "linux-x64/testbin" + assert archive_name == "linux-x64-1.0.0.tar.gz" + + def test_get_platform_info_linux_arm64(self): + """Test platform info detection for Linux ARM64.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + version="1.0.0" + ) + + with patch('platform.system', return_value='Linux'), \ + patch('platform.machine', return_value='aarch64'): + binary_name, archive_name = manager._get_platform_info() + + assert binary_name == "linux-arm64/testbin" + assert archive_name == "linux-arm64-1.0.0.tar.gz" + + def test_get_platform_info_windows_x64(self): + """Test platform info detection for Windows x64.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + version="1.0.0" + ) + + with patch('platform.system', return_value='Windows'), \ + patch('platform.machine', return_value='AMD64'): + binary_name, archive_name = manager._get_platform_info() + + assert binary_name == "win-x64/testbin.exe" + assert archive_name == "win-x64-1.0.0.zip" + + def test_get_platform_info_macos_x64(self): + """Test platform info detection for macOS x64.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + version="1.0.0" + ) + + with patch('platform.system', return_value='Darwin'), \ + patch('platform.machine', return_value='x86_64'): + binary_name, archive_name = manager._get_platform_info() + + assert binary_name == "osx-x64/testbin" + assert archive_name == "osx-x64-1.0.0.tar.gz" + + def test_get_platform_info_macos_arm64(self): + """Test platform info detection for macOS ARM64.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + version="1.0.0" + ) + + with 
patch('platform.system', return_value='Darwin'), \ + patch('platform.machine', return_value='arm64'): + binary_name, archive_name = manager._get_platform_info() + + assert binary_name == "osx-arm64/testbin" + assert archive_name == "osx-arm64-1.0.0.tar.gz" + + def test_unsupported_architecture_raises(self): + """Test that unsupported architecture raises EnvironmentError.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + version="1.0.0" + ) + + with patch('platform.system', return_value='Linux'), \ + patch('platform.machine', return_value='i386'): + with pytest.raises(EnvironmentError) as exc_info: + manager._get_platform_info() + + assert "Unsupported architecture" in str(exc_info.value) + + def test_unsupported_os_raises(self): + """Test that unsupported OS raises EnvironmentError.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="test", + binary_base_name="testbin", + version="1.0.0" + ) + + with patch('platform.system', return_value='FreeBSD'), \ + patch('platform.machine', return_value='x86_64'): + with pytest.raises(EnvironmentError) as exc_info: + manager._get_platform_info() + + assert "Unsupported OS" in str(exc_info.value) + + def test_is_available_returns_false_when_no_archive(self): + """Test is_available returns False when archive doesn't exist.""" + manager = BinaryManager( + engine_name="test", + dist_subdir="nonexistent", + binary_base_name="testbin", + version="1.0.0" + ) + + assert manager.is_available() is False + + +class TestXmlPowerToolsEngine: + """Tests for the XmlPowerToolsEngine class.""" + + def test_engine_is_comparison_engine(self): + """Test that XmlPowerToolsEngine is a ComparisonEngine.""" + assert issubclass(XmlPowerToolsEngine, ComparisonEngine) + + def test_engine_name(self): + """Test the engine name property.""" + # Use mock to avoid binary extraction + with patch.object(BinaryManager, 'get_binary_path', return_value='/fake/path'): + engine = 
XmlPowerToolsEngine() + assert engine.name == "openxml-powertools" + + def test_engine_description(self): + """Test the engine description property.""" + with patch.object(BinaryManager, 'get_binary_path', return_value='/fake/path'): + engine = XmlPowerToolsEngine() + assert "Open-Xml-PowerTools" in engine.description + assert "WmlComparer" in engine.description + + def test_run_redline_is_alias_for_compare(self): + """Test that run_redline is an alias for compare.""" + with patch.object(BinaryManager, 'get_binary_path', return_value='/fake/path'): + engine = XmlPowerToolsEngine() + + with patch.object(engine, 'compare', return_value=(b'result', 'out', None)) as mock_compare: + result = engine.run_redline("author", b"orig", b"mod") + + mock_compare.assert_called_once_with("author", b"orig", b"mod") + assert result == (b'result', 'out', None) + + def test_extracted_binaries_path_property(self): + """Test the legacy extracted_binaries_path property.""" + with patch.object(BinaryManager, 'get_binary_path', return_value='/fake/binary/path'): + engine = XmlPowerToolsEngine() + assert engine.extracted_binaries_path == '/fake/binary/path' + + def test_custom_target_path(self): + """Test that custom target_path is passed to BinaryManager.""" + with patch.object(BinaryManager, '__init__', return_value=None) as mock_init, \ + patch.object(BinaryManager, 'get_binary_path', return_value='/fake/path'): + XmlPowerToolsEngine(target_path="/custom/target") + + # Check that BinaryManager was initialized with the custom path + call_kwargs = mock_init.call_args[1] + assert call_kwargs.get('target_path') == '/custom/target' + + +class TestDocxodusEngine: + """Tests for the DocxodusEngine class.""" + + def test_engine_is_comparison_engine(self): + """Test that DocxodusEngine is a ComparisonEngine.""" + assert issubclass(DocxodusEngine, ComparisonEngine) + + def test_engine_name(self): + """Test the engine name property.""" + engine = DocxodusEngine.__new__(DocxodusEngine) + assert 
engine.name == "docxodus" + + def test_engine_description(self): + """Test the engine description property.""" + engine = DocxodusEngine.__new__(DocxodusEngine) + assert "Docxodus" in engine.description + assert ".NET 8.0" in engine.description + + def test_lazy_binary_extraction(self): + """Test that binary is extracted lazily (on first use).""" + # DocxodusEngine should not extract binary on init + with patch.object(BinaryManager, '__init__', return_value=None) as mock_init, \ + patch.object(BinaryManager, 'get_binary_path') as mock_get_path: + mock_init.return_value = None + + engine = DocxodusEngine() + + # get_binary_path should NOT have been called yet + mock_get_path.assert_not_called() + + def test_run_redline_alias(self): + """Test that run_redline is available and calls compare.""" + engine = DocxodusEngine.__new__(DocxodusEngine) + engine._binary_manager = MagicMock() + engine._binary_path = None + + with patch.object(engine, 'compare', return_value=(b'result', 'out', None)) as mock_compare: + result = engine.run_redline("author", b"orig", b"mod") + + mock_compare.assert_called_once_with("author", b"orig", b"mod") + assert result == (b'result', 'out', None) + + +class TestEngineIntegration: + """Integration tests for engines with real files (when binaries available).""" + + @pytest.fixture + def engine(self): + """Try to create an XmlPowerToolsEngine, skip if binaries unavailable.""" + try: + return XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + def test_compare_with_bytes(self, engine, original_docx, modified_docx): + """Test comparison with bytes input.""" + redline_output, stdout, stderr = engine.compare( + author="TestAuthor", + original=original_docx, + modified=modified_docx + ) + + assert redline_output is not None + assert isinstance(redline_output, bytes) + assert len(redline_output) > 0 + assert stderr is None + # Check that revisions were found + assert stdout is not None + assert 
"Revisions found:" in stdout + + def test_compare_with_paths(self, engine): + """Test comparison with Path input.""" + original_path = Path('tests/fixtures/original.docx') + modified_path = Path('tests/fixtures/modified.docx') + + redline_output, stdout, stderr = engine.compare( + author="TestAuthor", + original=original_path, + modified=modified_path + ) + + assert redline_output is not None + assert isinstance(redline_output, bytes) + assert len(redline_output) > 0 + + def test_compare_with_mixed_input(self, engine, original_docx): + """Test comparison with mixed bytes and Path input.""" + modified_path = Path('tests/fixtures/modified.docx') + + redline_output, stdout, stderr = engine.compare( + author="TestAuthor", + original=original_docx, # bytes + modified=modified_path # Path + ) + + assert redline_output is not None + assert isinstance(redline_output, bytes) + + def test_author_tag_in_output(self, engine, original_docx, modified_docx): + """Test that author tag is used in the output.""" + redline_output, stdout, stderr = engine.compare( + author="CustomAuthor", + original=original_docx, + modified=modified_docx + ) + + assert redline_output is not None + # The author tag should be embedded in the document + # We can't easily verify this without parsing the docx + + def test_output_is_valid_docx(self, engine, original_docx, modified_docx): + """Test that the output is a valid docx file (starts with PK zip signature).""" + redline_output, _, _ = engine.compare( + author="TestAuthor", + original=original_docx, + modified=modified_docx + ) + + # DOCX files are ZIP files, should start with PK + assert redline_output[:2] == b'PK' + + def test_temp_files_cleaned_up(self, engine, original_docx, modified_docx): + """Test that temporary files are cleaned up after comparison.""" + import glob + + # Get count of temp files before + temp_dir = tempfile.gettempdir() + temp_files_before = set(glob.glob(os.path.join(temp_dir, '*.docx'))) + + # Run comparison + 
engine.compare( + author="TestAuthor", + original=original_docx, + modified=modified_docx + ) + + # Get count of temp files after + temp_files_after = set(glob.glob(os.path.join(temp_dir, '*.docx'))) + + # Should not have more temp files than before + new_temp_files = temp_files_after - temp_files_before + assert len(new_temp_files) == 0, f"Temp files not cleaned up: {new_temp_files}" + + +class TestBackwardCompatibility: + """Tests for backward compatibility with existing code.""" + + @pytest.fixture + def engine(self): + """Try to create an XmlPowerToolsEngine, skip if binaries unavailable.""" + try: + return XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + def test_run_redline_method_exists(self, engine): + """Test that the run_redline method still exists.""" + assert hasattr(engine, 'run_redline') + assert callable(engine.run_redline) + + def test_run_redline_produces_same_result_as_compare( + self, engine, original_docx, modified_docx + ): + """Test that run_redline produces the same result as compare.""" + result1 = engine.run_redline("Author", original_docx, modified_docx) + result2 = engine.compare("Author", original_docx, modified_docx) + + # stdout/stderr should match + assert result1[1] == result2[1] + assert result1[2] == result2[2] + # Output bytes should be similar (may differ slightly due to timestamps) + assert len(result1[0]) > 0 + assert len(result2[0]) > 0 + + def test_extracted_binaries_path_exists(self, engine): + """Test that the extracted_binaries_path property still exists.""" + assert hasattr(engine, 'extracted_binaries_path') + path = engine.extracted_binaries_path + assert path is not None + assert isinstance(path, str) diff --git a/tests/test_openxml_differ.py b/tests/test_openxml_differ.py index 96ea1d9..099f027 100644 --- a/tests/test_openxml_differ.py +++ b/tests/test_openxml_differ.py @@ -1,37 +1,184 @@ +""" +Tests for the OpenXML differ functionality. 
+ +This file maintains backward compatibility with the original test suite +while also testing the new pluggable engine system. +""" + import os import pytest +from pathlib import Path from unittest.mock import patch, MagicMock from python_redlines.engines import XmlPowerToolsEngine def load_docx_bytes(file_path): + """Load a docx file as bytes.""" with open(file_path, 'rb') as file: return file.read() @pytest.fixture def original_docx(): + """Load the original test document.""" return load_docx_bytes('tests/fixtures/original.docx') @pytest.fixture def modified_docx(): + """Load the modified test document.""" return load_docx_bytes('tests/fixtures/modified.docx') def test_run_redlines_with_real_files(original_docx, modified_docx): - # Create an instance of the wrapper - wrapper = XmlPowerToolsEngine() + """Test the redline functionality with real document files. + + This is the original test from the codebase, maintained for + backward compatibility. + """ + try: + wrapper = XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") author_tag = "TestAuthor" # Running the wrapper function with actual file bytes - redline_output, stdout, stderr = wrapper.run_redline(author_tag, original_docx, modified_docx) + redline_output, stdout, stderr = wrapper.run_redline( + author_tag, original_docx, modified_docx + ) - # Asserting that some output is generated (specific assertions depend on expected output) + # Asserting that some output is generated assert redline_output is not None assert isinstance(redline_output, bytes) assert len(redline_output) > 0 assert stderr is None assert "Revisions found: 9" in stdout + + +def test_compare_method_equivalent_to_run_redline(original_docx, modified_docx): + """Test that the new compare() method works like run_redline().""" + try: + wrapper = XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + author_tag = "TestAuthor" + + # Using the 
new compare method + redline_output, stdout, stderr = wrapper.compare( + author_tag, original_docx, modified_docx + ) + + assert redline_output is not None + assert isinstance(redline_output, bytes) + assert len(redline_output) > 0 + assert stderr is None + assert "Revisions found: 9" in stdout + + +def test_get_engine_returns_xmlpowertools_by_default(): + """Test that get_engine() returns XmlPowerToolsEngine by default.""" + from python_redlines import get_engine + + try: + engine = get_engine() + except (FileNotFoundError, RuntimeError): + pytest.skip("Engines not available") + + assert engine.name == "openxml-powertools" + + +def test_list_engines_includes_both_engines(): + """Test that list_engines() returns both registered engines.""" + from python_redlines import list_engines + + engines = list_engines() + assert "openxml-powertools" in engines + assert "docxodus" in engines + + +def test_get_engine_by_name(): + """Test getting a specific engine by name.""" + from python_redlines import get_engine + + # Test getting openxml-powertools + try: + engine = get_engine("openxml-powertools") + assert engine.name == "openxml-powertools" + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + +def test_engine_properties(): + """Test that engine has expected properties.""" + try: + engine = XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + # Test name property + assert engine.name == "openxml-powertools" + + # Test description property + assert len(engine.description) > 0 + assert "Open-Xml-PowerTools" in engine.description + + +def test_comparison_error_handling(): + """Test that comparison errors are properly raised.""" + from python_redlines.base import ComparisonError + + # Create an error with all attributes + error = ComparisonError( + "Test error", + stdout="some output", + stderr="some error" + ) + + assert error.message == "Test error" + assert error.stdout == "some output" 
+ assert error.stderr == "some error" + + # Test string representation + error_str = str(error) + assert "Test error" in error_str + assert "stdout: some output" in error_str + assert "stderr: some error" in error_str + + +def test_output_file_is_valid_docx(original_docx, modified_docx): + """Test that the output is a valid DOCX file.""" + try: + wrapper = XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + redline_output, _, _ = wrapper.compare( + "TestAuthor", original_docx, modified_docx + ) + + # DOCX files are ZIP archives, should start with PK signature + assert redline_output[:2] == b'PK', "Output should be a valid ZIP/DOCX file" + assert len(redline_output) > 100, "Output should have substantial content" + + +def test_comparison_with_path_objects(original_docx, modified_docx): + """Test comparison using Path objects instead of bytes.""" + try: + wrapper = XmlPowerToolsEngine() + except FileNotFoundError: + pytest.skip("XmlPowerTools binaries not available") + + original_path = Path('tests/fixtures/original.docx') + modified_path = Path('tests/fixtures/modified.docx') + + redline_output, stdout, stderr = wrapper.compare( + "TestAuthor", original_path, modified_path + ) + + assert redline_output is not None + assert isinstance(redline_output, bytes) + assert len(redline_output) > 0 + assert "Revisions found: 9" in stdout diff --git a/tests/test_package.py b/tests/test_package.py new file mode 100644 index 0000000..d6905af --- /dev/null +++ b/tests/test_package.py @@ -0,0 +1,148 @@ +""" +Tests for the package-level API and imports. 
+""" + +import pytest + + +class TestPackageImports: + """Tests for package-level imports.""" + + def test_import_package(self): + """Test that the main package can be imported.""" + import python_redlines + assert python_redlines is not None + + def test_import_version(self): + """Test that version is available.""" + from python_redlines import __version__ + assert __version__ is not None + assert isinstance(__version__, str) + + def test_import_base_classes(self): + """Test that base classes can be imported.""" + from python_redlines import ComparisonEngine, ComparisonError + assert ComparisonEngine is not None + assert ComparisonError is not None + + def test_import_engine_classes(self): + """Test that engine classes can be imported.""" + from python_redlines import XmlPowerToolsEngine, DocxodusEngine + assert XmlPowerToolsEngine is not None + assert DocxodusEngine is not None + + def test_import_registry_functions(self): + """Test that registry functions can be imported.""" + from python_redlines import ( + get_engine, + list_engines, + list_available_engines, + EngineRegistry, + ) + assert get_engine is not None + assert list_engines is not None + assert list_available_engines is not None + assert EngineRegistry is not None + + def test_all_exports(self): + """Test that __all__ contains expected exports.""" + import python_redlines + + expected_exports = [ + "__version__", + "ComparisonEngine", + "ComparisonError", + "XmlPowerToolsEngine", + "DocxodusEngine", + "get_engine", + "list_engines", + "list_available_engines", + "EngineRegistry", + ] + + for export in expected_exports: + assert export in python_redlines.__all__, f"Missing export: {export}" + assert hasattr(python_redlines, export), f"Missing attribute: {export}" + + +class TestPackageDocstrings: + """Tests for package documentation.""" + + def test_package_docstring(self): + """Test that the package has a docstring.""" + import python_redlines + assert python_redlines.__doc__ is not None + assert 
"Python Redlines" in python_redlines.__doc__ + + def test_comparison_engine_docstring(self): + """Test that ComparisonEngine has a docstring.""" + from python_redlines import ComparisonEngine + assert ComparisonEngine.__doc__ is not None + + def test_get_engine_docstring(self): + """Test that get_engine has a docstring.""" + from python_redlines import get_engine + assert get_engine.__doc__ is not None + + +class TestEngineRegistration: + """Tests for engine registration behavior.""" + + def test_engines_auto_registered_on_import(self): + """Test that engines are automatically registered on import.""" + from python_redlines import list_engines + + engines = list_engines() + assert len(engines) >= 2 + assert "openxml-powertools" in engines + assert "docxodus" in engines + + def test_default_engine_is_openxml_powertools(self): + """Test that the default engine is openxml-powertools.""" + from python_redlines import EngineRegistry + + default = EngineRegistry.get_default_engine_name() + assert default == "openxml-powertools" + + +class TestPackageUsagePatterns: + """Tests for common usage patterns documented in the package.""" + + def test_basic_usage_pattern(self): + """Test the basic usage pattern from the docstring.""" + from python_redlines import get_engine + + # This tests the pattern, not the actual comparison + # (which requires binaries) + try: + engine = get_engine() + assert engine.name == "openxml-powertools" + except FileNotFoundError: + pytest.skip("Binaries not available") + + def test_specific_engine_pattern(self): + """Test selecting a specific engine by name.""" + from python_redlines import get_engine + + try: + engine = get_engine("openxml-powertools") + assert engine.name == "openxml-powertools" + except FileNotFoundError: + pytest.skip("Binaries not available") + + def test_list_engines_pattern(self): + """Test listing available engines.""" + from python_redlines import list_engines + + engines = list_engines() + assert isinstance(engines, list) 
+ assert len(engines) > 0 + + def test_invalid_engine_raises_value_error(self): + """Test that requesting an invalid engine raises ValueError.""" + from python_redlines import get_engine + + with pytest.raises(ValueError) as exc_info: + get_engine("nonexistent-engine") + + assert "nonexistent-engine" in str(exc_info.value) diff --git a/tests/test_registry.py b/tests/test_registry.py new file mode 100644 index 0000000..009c492 --- /dev/null +++ b/tests/test_registry.py @@ -0,0 +1,245 @@ +""" +Tests for the engine registry system. +""" + +import pytest +from pathlib import Path +from typing import Tuple, Optional, Union + +from python_redlines.base import ComparisonEngine +from python_redlines.registry import ( + EngineRegistry, + get_engine, + list_engines, + list_available_engines, +) + + +class MockEngine(ComparisonEngine): + """A mock engine for testing the registry.""" + + def __init__(self, target_path: Optional[str] = None): + self.target_path = target_path + + @property + def name(self) -> str: + return "mock-engine" + + @property + def description(self) -> str: + return "A mock engine for testing" + + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + return b"mock result", f"Author: {author}", None + + def is_available(self) -> bool: + return True + + +class AnotherMockEngine(ComparisonEngine): + """Another mock engine for testing.""" + + @property + def name(self) -> str: + return "another-mock" + + @property + def description(self) -> str: + return "Another mock engine" + + def compare( + self, + author: str, + original: Union[bytes, Path], + modified: Union[bytes, Path] + ) -> Tuple[bytes, Optional[str], Optional[str]]: + return b"another result", None, None + + def is_available(self) -> bool: + return True + + +class TestEngineRegistry: + """Tests for the EngineRegistry class.""" + + @pytest.fixture(autouse=True) + def setup_and_teardown(self): + """Save 
and restore registry state around each test.""" + # Save current state + saved_engines = EngineRegistry._engines.copy() + saved_default = EngineRegistry._default_engine + + yield + + # Restore state + EngineRegistry._engines = saved_engines + EngineRegistry._default_engine = saved_default + + def test_register_engine(self): + """Test registering an engine.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + + assert "mock-engine" in EngineRegistry.list_engines() + + def test_register_engine_as_default(self): + """Test registering an engine as the default.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine, default=True) + + assert EngineRegistry.get_default_engine_name() == "mock-engine" + + def test_first_registered_becomes_default(self): + """Test that the first registered engine becomes default if none set.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + + assert EngineRegistry.get_default_engine_name() == "mock-engine" + + def test_get_engine_by_name(self): + """Test getting an engine by name.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + + engine = EngineRegistry.get_engine("mock-engine") + assert isinstance(engine, MockEngine) + assert engine.name == "mock-engine" + + def test_get_default_engine(self): + """Test getting the default engine.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine, default=True) + EngineRegistry.register(AnotherMockEngine) + + engine = EngineRegistry.get_engine() + assert isinstance(engine, MockEngine) + + def test_get_engine_unknown_raises(self): + """Test that getting an unknown engine raises ValueError.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + + with pytest.raises(ValueError) as exc_info: + EngineRegistry.get_engine("nonexistent") + + assert "nonexistent" in str(exc_info.value) + assert "mock-engine" in str(exc_info.value) + + def test_get_engine_empty_registry_raises(self): + """Test that getting an engine from empty 
registry raises RuntimeError.""" + EngineRegistry.clear() + + with pytest.raises(RuntimeError) as exc_info: + EngineRegistry.get_engine() + + assert "No comparison engines registered" in str(exc_info.value) + + def test_list_engines(self): + """Test listing all registered engines.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + EngineRegistry.register(AnotherMockEngine) + + engines = EngineRegistry.list_engines() + assert "mock-engine" in engines + assert "another-mock" in engines + assert len(engines) == 2 + + def test_set_default_engine(self): + """Test setting the default engine.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + EngineRegistry.register(AnotherMockEngine) + + EngineRegistry.set_default_engine("another-mock") + assert EngineRegistry.get_default_engine_name() == "another-mock" + + def test_set_default_unknown_raises(self): + """Test that setting unknown engine as default raises ValueError.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + + with pytest.raises(ValueError): + EngineRegistry.set_default_engine("nonexistent") + + def test_get_engine_with_kwargs(self): + """Test that kwargs are passed to engine constructor.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + + engine = EngineRegistry.get_engine("mock-engine", target_path="/custom/path") + assert engine.target_path == "/custom/path" + + def test_clear_registry(self): + """Test clearing the registry.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + EngineRegistry.register(AnotherMockEngine) + + assert len(EngineRegistry.list_engines()) == 2 + + EngineRegistry.clear() + + assert len(EngineRegistry.list_engines()) == 0 + assert EngineRegistry.get_default_engine_name() is None + + +class TestConvenienceFunctions: + """Tests for the module-level convenience functions.""" + + @pytest.fixture(autouse=True) + def setup_and_teardown(self): + """Save and restore registry state around each test.""" + 
saved_engines = EngineRegistry._engines.copy() + saved_default = EngineRegistry._default_engine + + yield + + EngineRegistry._engines = saved_engines + EngineRegistry._default_engine = saved_default + + def test_get_engine_function(self): + """Test the get_engine convenience function.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine, default=True) + + engine = get_engine() + assert isinstance(engine, MockEngine) + + engine = get_engine("mock-engine") + assert isinstance(engine, MockEngine) + + def test_list_engines_function(self): + """Test the list_engines convenience function.""" + EngineRegistry.clear() + EngineRegistry.register(MockEngine) + EngineRegistry.register(AnotherMockEngine) + + engines = list_engines() + assert "mock-engine" in engines + assert "another-mock" in engines + + +class TestBuiltInEnginesRegistration: + """Tests for built-in engine auto-registration.""" + + def test_builtin_engines_registered(self): + """Test that built-in engines are registered on import.""" + # Re-import to ensure registration happens + from python_redlines import registry + + engines = EngineRegistry.list_engines() + assert "openxml-powertools" in engines + assert "docxodus" in engines + + def test_openxml_powertools_is_default(self): + """Test that openxml-powertools is the default engine.""" + from python_redlines import registry + + default = EngineRegistry.get_default_engine_name() + assert default == "openxml-powertools"