# Repository viewer header (not part of the program):
# transkripte/tests/test_transcript_pipeline.py
# 2026-04-15 00:01:38 +02:00
#
# 312 lines, 11 KiB, Python

from __future__ import annotations
import tempfile
import unittest
from unittest import mock
from datetime import UTC, datetime
from pathlib import Path
from transcript_pipeline import (
AudioMetadata,
Settings,
TranscriptPipeline,
build_frontmatter,
format_note_date,
format_duration,
join_remote_path,
remove_blank_lines,
strip_leading_h1,
short_hash,
slugify_title,
split_text_into_chunks,
)
class UtilityTests(unittest.TestCase):
    """Tests for the stateless helper functions imported from ``transcript_pipeline``."""

    def test_slugify_title(self) -> None:
        """A title with slash, colon, question mark and outer spaces yields the expected slug."""
        self.assertEqual(slugify_title(" 2026/04: Elternabend? "), "2026 04 Elternabend")

    def test_format_duration(self) -> None:
        """Durations below and above one hour are rendered as ``M:SS`` and ``H:MM:SS``."""
        expectations = {59: "0:59", 3661: "1:01:01"}
        for seconds, rendered in expectations.items():
            # subTest keeps the second case running (and reported) if the first fails.
            with self.subTest(seconds=seconds):
                self.assertEqual(format_duration(seconds), rendered)

    def test_format_note_date(self) -> None:
        """A timezone-aware datetime is formatted as a six-digit ``YYMMDD`` string."""
        self.assertEqual(format_note_date(datetime(2026, 4, 9, 18, 0, tzinfo=UTC)), "260409")

    def test_split_text_into_chunks(self) -> None:
        """Ten blank-line-separated paragraphs with a small target give several non-blank chunks."""
        text = "\n\n".join([f"Paragraph {index} " + ("x" * 50) for index in range(10)])
        chunks = split_text_into_chunks(text, target_chars=160)
        self.assertGreater(len(chunks), 1)
        self.assertTrue(all(chunk.strip() for chunk in chunks))

    def test_build_frontmatter(self) -> None:
        """String, float and list values all show up in the rendered frontmatter."""
        frontmatter = build_frontmatter(
            {
                "title": "Test",
                "duration_seconds": 12.5,
                "tags": ["transkript", "ki-zusammenfassung"],
            }
        )
        self.assertIn('title: "Test"', frontmatter)
        self.assertIn("duration_seconds: 12.5", frontmatter)
        self.assertIn(' - "transkript"', frontmatter)

    def test_remove_blank_lines(self) -> None:
        """Empty and whitespace-only lines are dropped; the trailing newline stays."""
        text = "a\n\nb\n \n\nc\n"
        self.assertEqual(remove_blank_lines(text), "a\nb\nc\n")

    def test_strip_leading_h1(self) -> None:
        """A leading H1 that matches the given title is removed from the text."""
        text = "# Titel\n## Abschnitt\nText\n"
        self.assertEqual(strip_leading_h1(text, "Titel"), "## Abschnitt\nText")

    def test_join_remote_path(self) -> None:
        """Path segments are appended to the remote root with single slashes."""
        self.assertEqual(
            join_remote_path("transkripte:/", "abc123", "audio", "file.m4a"),
            "transkripte:/abc123/audio/file.m4a",
        )
class NotePathTests(unittest.TestCase):
    """Tests that drive a ``TranscriptPipeline`` against a throw-away directory tree."""

    @staticmethod
    def _make_pair(basename: str = "Meeting") -> object:
        """Return a minimal stand-in object that only exposes ``basename``."""
        return type("Pair", (), {"basename": basename})()

    def setUp(self) -> None:
        """Create the temporary directory, the settings and the pipeline under test."""
        self.tempdir = tempfile.TemporaryDirectory()
        # Registered right away: cleanups still run when a later setUp step
        # raises, whereas tearDown() is skipped in that case.
        self.addCleanup(self.tempdir.cleanup)
        self.base_dir = Path(self.tempdir.name)
        self.settings = Settings(
            base_dir=self.base_dir,
            watch_dir=self.base_dir / "watch",
            obsidian_dir=self.base_dir / "vault",
            archive_dir=self.base_dir / "archive",
            memos_enabled=True,
            memos_site_url="https://memos.maddin.app",
            memos_content_dir=self.base_dir / "memos-content",
            memos_quartz_dir=self.base_dir / "memos-quartz",
            memos_output_dir=self.base_dir / "memos-site",
            memos_build_command="true",
            memos_rclone_remote=None,
            memos_rclone_excludes=(),
            memos_sync_htpasswd=False,
            memos_remote_htpasswd_path=None,
            memos_basic_auth_user="maddin",
            memos_basic_auth_password="secret",
            memos_basic_auth_htpasswd_path=self.base_dir / "deploy/nginx/memos.htpasswd",
            prompt_path=self.base_dir / "prompt.md",
            state_db_path=self.base_dir / "state.sqlite3",
            log_path=self.base_dir / "pipeline.log",
            openai_api_key="test-key",
            openai_model="test-model",
            debounce_seconds=1,
            retention_days=7,
            request_timeout_seconds=30,
            ffprobe_bin="/opt/homebrew/bin/ffprobe",
            fswatch_bin="/opt/homebrew/bin/fswatch",
            rclone_bin="/opt/homebrew/bin/rclone",
            rclone_remote="transkripte:/",
            ntfy_base_url="https://ntfy.maddin.app",
            ntfy_topic="Transkript",
            ntfy_access_token=None,
        )
        self.settings.ensure_directories()
        self.settings.prompt_path.write_text("Prompt", encoding="utf-8")
        self.settings.memos_quartz_dir.mkdir(parents=True, exist_ok=True)
        self.pipeline = TranscriptPipeline(self.settings)

    def tearDown(self) -> None:
        """Close the pipeline; the temporary directory is removed afterwards by the cleanup."""
        self.pipeline.close()

    def test_unique_note_path_uses_existing_source_file(self) -> None:
        """An existing note carrying the same ``source_id`` is reused as the target path."""
        source_id = short_hash("source")
        existing = self.settings.obsidian_dir / "260409 Elternabend.md"
        existing.write_text(
            f'---\nsource_id: "{source_id}"\n---\n# Test\n',
            encoding="utf-8",
        )
        target = self.pipeline.build_note_target(
            source_id=source_id,
            title="Elternabend",
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            note_type="summary",
        )
        self.assertEqual(target.note_path, existing)

    def test_unique_note_path_adds_suffix_for_foreign_note(self) -> None:
        """A same-named note with a different ``source_id`` forces a ``-2`` suffix."""
        source_id = short_hash("source-a")
        foreign_id = short_hash("source-b")
        existing = self.settings.obsidian_dir / "260409 Elternabend.md"
        existing.write_text(
            f'---\nsource_id: "{foreign_id}"\n---\n# Other\n',
            encoding="utf-8",
        )
        target = self.pipeline.build_note_target(
            source_id=source_id,
            title="Elternabend",
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            note_type="summary",
        )
        self.assertEqual(target.note_path.name, "260409 Elternabend-2.md")

    def test_raw_transcript_note_uses_suffix(self) -> None:
        """Notes of type ``raw_transcript`` get the `` - Transkript`` file name suffix."""
        source_id = short_hash("source-raw")
        target = self.pipeline.build_note_target(
            source_id=source_id,
            title="Elternabend",
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            note_type="raw_transcript",
        )
        self.assertEqual(target.note_path.name, "260409 Elternabend - Transkript.md")

    def test_build_source_id_is_stable(self) -> None:
        """Two equal pair/metadata inputs produce the same source id."""
        metadata = AudioMetadata(
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            recorded_at_source="ffprobe.creation_time",
            duration_seconds=120.0,
            duration_human="2:00",
            audio_size=1024,
        )
        # Two distinct stub objects, so equality cannot come from object identity.
        source_id_a = self.pipeline.build_source_id(self._make_pair(), metadata)
        source_id_b = self.pipeline.build_source_id(self._make_pair(), metadata)
        self.assertEqual(source_id_a, source_id_b)

    def test_process_available_pairs_marks_unstable_scan_for_retry(self) -> None:
        """A pair whose files are not stable is skipped and the scan is flagged as unstable."""
        pair = self._make_pair()
        self.pipeline.scan_pairs = lambda: [pair]  # type: ignore[method-assign]
        self.pipeline.files_stable = lambda _: False  # type: ignore[method-assign]
        self.pipeline.process_pair = lambda _: self.fail("process_pair should not run")  # type: ignore[method-assign]
        processed = self.pipeline.process_available_pairs()
        self.assertEqual(processed, 0)
        self.assertTrue(self.pipeline.last_scan_had_unstable)

    def test_sync_memos_site_exports_public_note_without_audio_sources(self) -> None:
        """The exported note keeps title and transcript but omits remote-audio details."""
        note_path = self.settings.obsidian_dir / "260413 Testmemo.md"
        note_path.write_text(
            """---
title: "Testmemo"
type: "summary"
date: "2026-04-13"
recorded_at: "2026-04-13T12:00:00+00:00"
duration_human: "5:00"
source_id: "abc123"
source_audio_cache: "/tmp/audio.m4a"
remote_audio: "transkripte:/abc123/audio/test.m4a"
remote_audio_status: "uploaded"
processed_at: "2026-04-13T12:10:00+00:00"
updated_at: "2026-04-13T12:10:00+00:00"
tags:
- "transkript"
- "ki-zusammenfassung"
---
## Metadaten
- Quelle: `Test`
## Zusammenfassung
Kurzfassung
## Transkript
Hallo Welt
## Quellen
- Remote-Audio: `transkripte:/abc123/audio/test.m4a`
- Upload-Status: `uploaded`
""",
            encoding="utf-8",
        )
        exported_count = self.pipeline.sync_memos_site()
        self.assertEqual(exported_count, 1)
        exported_path = self.settings.memos_content_dir / "transkripte" / note_path.name
        exported_text = exported_path.read_text(encoding="utf-8")
        self.assertIn('title: "Testmemo"', exported_text)
        self.assertIn("## Transkript", exported_text)
        self.assertNotIn("remote_audio", exported_text)
        self.assertNotIn("## Quellen", exported_text)
        self.assertNotIn("Upload-Status", exported_text)
        self.assertTrue((self.settings.memos_content_dir / "index.md").exists())

    def test_sync_memos_site_writes_paginated_index_pages(self) -> None:
        """With 25 notes, the index links to a second page and that page links back."""
        for index in range(25):
            day = (index % 28) + 1
            note_path = self.settings.obsidian_dir / f"2604{index:02d} Memo {index:02d}.md"
            note_path.write_text(
                f"""---
title: "Memo {index:02d}"
type: "summary"
date: "2026-04-{day:02d}"
recorded_at: "2026-04-{day:02d}T12:00:00+00:00"
source_id: "src-{index:02d}"
tags:
- "transkript"
---
## Zusammenfassung
Memo {index:02d}
""",
                encoding="utf-8",
            )
        self.pipeline.sync_memos_site()
        index_text = (self.settings.memos_content_dir / "index.md").read_text(encoding="utf-8")
        page_two_text = (self.settings.memos_content_dir / "seite-2.md").read_text(encoding="utf-8")
        self.assertIn("[[seite-2|Ältere Memos]]", index_text)
        self.assertIn("[[index|Neuere Memos]]", page_two_text)
        self.assertIn("## Übersicht Seite 2", page_two_text)

    def test_ensure_pretty_urls_creates_directory_index_copies(self) -> None:
        """``seite-2.html`` is duplicated as ``seite-2/index.html`` with the same content."""
        # Fails early if no publisher exists; also narrows the type for checkers.
        assert self.pipeline.memos_publisher is not None
        html_path = self.settings.memos_output_dir / "seite-2.html"
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text("<html>Page 2</html>", encoding="utf-8")
        self.pipeline.memos_publisher.ensure_pretty_urls()
        pretty_path = self.settings.memos_output_dir / "seite-2" / "index.html"
        self.assertTrue(pretty_path.exists())
        self.assertEqual(pretty_path.read_text(encoding="utf-8"), "<html>Page 2</html>")

    @mock.patch("transcript_pipeline.subprocess.run")
    def test_memos_deploy_uses_rclone_and_optionally_htpasswd(self, run_mock: mock.Mock) -> None:
        """Deploying issues an ``rclone sync`` of the site and an ``rclone copyto`` of the htpasswd file."""
        # Fails early if no publisher exists; also narrows the type for checkers.
        assert self.pipeline.memos_publisher is not None
        self.settings.memos_rclone_remote = "mln:/home/maddin/transkripte/site"
        self.settings.memos_rclone_excludes = ("README.md", ".gitignore")
        self.settings.memos_sync_htpasswd = True
        self.settings.memos_remote_htpasswd_path = "mln:/home/maddin/transkripte/.htpasswd"
        self.settings.memos_basic_auth_htpasswd_path.write_text("maddin:hash\n", encoding="utf-8")
        self.pipeline.memos_publisher.deploy_site()
        # First positional argument of every recorded subprocess.run call.
        commands = [call.args[0] for call in run_mock.call_args_list]
        self.assertIn(
            [
                "/opt/homebrew/bin/rclone",
                "sync",
                "--delete-after",
                "--fast-list",
                "--exclude",
                "README.md",
                "--exclude",
                ".gitignore",
                str(self.settings.memos_output_dir),
                "mln:/home/maddin/transkripte/site",
            ],
            commands,
        )
        self.assertIn(
            [
                "/opt/homebrew/bin/rclone",
                "copyto",
                str(self.settings.memos_basic_auth_htpasswd_path),
                "mln:/home/maddin/transkripte/.htpasswd",
            ],
            commands,
        )
# Run the test suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()