Initial import of transcript pipeline
This commit is contained in:
@@ -0,0 +1,311 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import tempfile
|
||||
import unittest
|
||||
from unittest import mock
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
|
||||
from transcript_pipeline import (
|
||||
AudioMetadata,
|
||||
Settings,
|
||||
TranscriptPipeline,
|
||||
build_frontmatter,
|
||||
format_note_date,
|
||||
format_duration,
|
||||
join_remote_path,
|
||||
remove_blank_lines,
|
||||
strip_leading_h1,
|
||||
short_hash,
|
||||
slugify_title,
|
||||
split_text_into_chunks,
|
||||
)
|
||||
|
||||
|
||||
class UtilityTests(unittest.TestCase):
    """Checks for the stateless helper functions exported by transcript_pipeline."""

    # A title with slashes, a colon, a question mark and surrounding spaces
    # must come back as a clean, space-separated slug.
    def test_slugify_title(self) -> None:
        raw_title = " 2026/04: Elternabend? "
        self.assertEqual(slugify_title(raw_title), "2026 04 Elternabend")

    # Covers both the minutes-only and the hours:minutes:seconds rendering.
    def test_format_duration(self) -> None:
        cases = (
            (59, "0:59"),
            (3661, "1:01:01"),
        )
        for seconds, rendered in cases:
            self.assertEqual(format_duration(seconds), rendered)

    # The note date prefix is the two-digit year, month and day run together.
    def test_format_note_date(self) -> None:
        moment = datetime(2026, 4, 9, 18, 0, tzinfo=UTC)
        self.assertEqual(format_note_date(moment), "260409")

    # Ten paragraphs with a 160-character target must yield several chunks,
    # none of which may consist of whitespace only.
    def test_split_text_into_chunks(self) -> None:
        paragraphs = [f"Paragraph {index} " + ("x" * 50) for index in range(10)]
        pieces = split_text_into_chunks("\n\n".join(paragraphs), target_chars=160)
        self.assertGreater(len(pieces), 1)
        every_piece_has_text = all(piece.strip() for piece in pieces)
        self.assertTrue(every_piece_has_text)

    # String values are quoted, numbers are emitted bare, lists become "- item" entries.
    def test_build_frontmatter(self) -> None:
        fields = {
            "title": "Test",
            "duration_seconds": 12.5,
            "tags": ["transkript", "ki-zusammenfassung"],
        }
        rendered = build_frontmatter(fields)
        expected_fragments = (
            'title: "Test"',
            "duration_seconds: 12.5",
            ' - "transkript"',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, rendered)

    # Empty lines and whitespace-only lines are both dropped.
    def test_remove_blank_lines(self) -> None:
        with_gaps = "a\n\nb\n \n\nc\n"
        self.assertEqual(remove_blank_lines(with_gaps), "a\nb\nc\n")

    # A leading H1 that repeats the title is removed; the expected result
    # also carries no trailing newline.
    def test_strip_leading_h1(self) -> None:
        document = "# Titel\n## Abschnitt\nText\n"
        stripped = strip_leading_h1(document, "Titel")
        self.assertEqual(stripped, "## Abschnitt\nText")

    # Segments are appended to the remote root with single slashes.
    def test_join_remote_path(self) -> None:
        joined = join_remote_path("transkripte:/", "abc123", "audio", "file.m4a")
        self.assertEqual(joined, "transkripte:/abc123/audio/file.m4a")
|
||||
|
||||
|
||||
class NotePathTests(unittest.TestCase):
    """Pipeline-level tests that run against a throwaway directory tree.

    Each test gets a fresh ``Settings`` object rooted in a temporary directory
    and a ``TranscriptPipeline`` built from it. The tests cover note target
    naming, source-id stability, the unstable-scan retry flag, the memos site
    export and the rclone-based deploy.
    """

    def setUp(self) -> None:
        self.tempdir = tempfile.TemporaryDirectory()
        # Register the cleanup right away: unittest skips tearDown() when
        # setUp() raises, but cleanups registered so far still run.
        self.addCleanup(self.tempdir.cleanup)
        self.base_dir = Path(self.tempdir.name)
        self.settings = Settings(
            base_dir=self.base_dir,
            watch_dir=self.base_dir / "watch",
            obsidian_dir=self.base_dir / "vault",
            archive_dir=self.base_dir / "archive",
            memos_enabled=True,
            memos_site_url="https://memos.maddin.app",
            memos_content_dir=self.base_dir / "memos-content",
            memos_quartz_dir=self.base_dir / "memos-quartz",
            memos_output_dir=self.base_dir / "memos-site",
            memos_build_command="true",
            memos_rclone_remote=None,
            memos_rclone_excludes=(),
            memos_sync_htpasswd=False,
            memos_remote_htpasswd_path=None,
            memos_basic_auth_user="maddin",
            memos_basic_auth_password="secret",
            memos_basic_auth_htpasswd_path=self.base_dir / "deploy/nginx/memos.htpasswd",
            prompt_path=self.base_dir / "prompt.md",
            state_db_path=self.base_dir / "state.sqlite3",
            log_path=self.base_dir / "pipeline.log",
            openai_api_key="test-key",
            openai_model="test-model",
            debounce_seconds=1,
            retention_days=7,
            request_timeout_seconds=30,
            ffprobe_bin="/opt/homebrew/bin/ffprobe",
            fswatch_bin="/opt/homebrew/bin/fswatch",
            rclone_bin="/opt/homebrew/bin/rclone",
            rclone_remote="transkripte:/",
            ntfy_base_url="https://ntfy.maddin.app",
            ntfy_topic="Transkript",
            ntfy_access_token=None,
        )
        self.settings.ensure_directories()
        self.settings.prompt_path.write_text("Prompt", encoding="utf-8")
        self.settings.memos_quartz_dir.mkdir(parents=True, exist_ok=True)
        self.pipeline = TranscriptPipeline(self.settings)
        # Cleanups run in LIFO order: the pipeline is closed first, then the
        # temporary directory is removed. A failure while closing no longer
        # prevents the directory from being deleted.
        self.addCleanup(self.pipeline.close)

    # A note that already carries the same source_id is reused as the target.
    def test_unique_note_path_uses_existing_source_file(self) -> None:
        source_id = short_hash("source")
        existing = self.settings.obsidian_dir / "260409 Elternabend.md"
        existing.write_text(
            f'---\nsource_id: "{source_id}"\n---\n# Test\n',
            encoding="utf-8",
        )
        target = self.pipeline.build_note_target(
            source_id=source_id,
            title="Elternabend",
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            note_type="summary",
        )
        self.assertEqual(target.note_path, existing)

    # A note with the same file name but a different source_id must not be
    # overwritten; the new target gets a "-2" suffix instead.
    def test_unique_note_path_adds_suffix_for_foreign_note(self) -> None:
        source_id = short_hash("source-a")
        foreign_id = short_hash("source-b")
        existing = self.settings.obsidian_dir / "260409 Elternabend.md"
        existing.write_text(
            f'---\nsource_id: "{foreign_id}"\n---\n# Other\n',
            encoding="utf-8",
        )
        target = self.pipeline.build_note_target(
            source_id=source_id,
            title="Elternabend",
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            note_type="summary",
        )
        self.assertEqual(target.note_path.name, "260409 Elternabend-2.md")

    # Raw transcript notes are distinguished by a " - Transkript" name suffix.
    def test_raw_transcript_note_uses_suffix(self) -> None:
        source_id = short_hash("source-raw")
        target = self.pipeline.build_note_target(
            source_id=source_id,
            title="Elternabend",
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            note_type="raw_transcript",
        )
        self.assertEqual(target.note_path.name, "260409 Elternabend - Transkript.md")

    # Two separate but equal pair stubs plus the same metadata must hash to
    # the same source id.
    def test_build_source_id_is_stable(self) -> None:
        metadata = AudioMetadata(
            recorded_at=datetime(2026, 4, 9, 18, 0, tzinfo=UTC),
            recorded_at_source="ffprobe.creation_time",
            duration_seconds=120.0,
            duration_human="2:00",
            audio_size=1024,
        )
        # The stub only provides ``basename``; any other attribute access in
        # build_source_id would surface as an AttributeError here.
        source_id_a = self.pipeline.build_source_id(
            type("Pair", (), {"basename": "Meeting"})(),
            metadata,
        )
        source_id_b = self.pipeline.build_source_id(
            type("Pair", (), {"basename": "Meeting"})(),
            metadata,
        )
        self.assertEqual(source_id_a, source_id_b)

    # When the only scanned pair is reported as unstable, nothing is processed
    # and the pipeline records that the scan had unstable files.
    def test_process_available_pairs_marks_unstable_scan_for_retry(self) -> None:
        pair = type("Pair", (), {"basename": "Meeting"})()
        # The pipeline instance is rebuilt for every test, so overriding the
        # bound methods here does not leak into other tests.
        self.pipeline.scan_pairs = lambda: [pair]  # type: ignore[method-assign]
        self.pipeline.files_stable = lambda _: False  # type: ignore[method-assign]
        self.pipeline.process_pair = lambda _: self.fail("process_pair should not run")  # type: ignore[method-assign]

        processed = self.pipeline.process_available_pairs()

        self.assertEqual(processed, 0)
        self.assertTrue(self.pipeline.last_scan_had_unstable)

    # The exported copy keeps title and transcript but must not expose the
    # audio-related frontmatter keys or the "Quellen" section.
    def test_sync_memos_site_exports_public_note_without_audio_sources(self) -> None:
        note_path = self.settings.obsidian_dir / "260413 Testmemo.md"
        # NOTE(review): the list items under "tags:" appear unindented in the
        # reviewed copy of this file; confirm against the frontmatter parser.
        note_path.write_text(
            """---
title: "Testmemo"
type: "summary"
date: "2026-04-13"
recorded_at: "2026-04-13T12:00:00+00:00"
duration_human: "5:00"
source_id: "abc123"
source_audio_cache: "/tmp/audio.m4a"
remote_audio: "transkripte:/abc123/audio/test.m4a"
remote_audio_status: "uploaded"
processed_at: "2026-04-13T12:10:00+00:00"
updated_at: "2026-04-13T12:10:00+00:00"
tags:
- "transkript"
- "ki-zusammenfassung"
---
## Metadaten
- Quelle: `Test`

## Zusammenfassung
Kurzfassung

## Transkript
Hallo Welt

## Quellen
- Remote-Audio: `transkripte:/abc123/audio/test.m4a`
- Upload-Status: `uploaded`
""",
            encoding="utf-8",
        )

        exported_count = self.pipeline.sync_memos_site()

        self.assertEqual(exported_count, 1)
        exported_path = self.settings.memos_content_dir / "transkripte" / note_path.name
        exported_text = exported_path.read_text(encoding="utf-8")
        self.assertIn('title: "Testmemo"', exported_text)
        self.assertIn("## Transkript", exported_text)
        self.assertNotIn("remote_audio", exported_text)
        self.assertNotIn("## Quellen", exported_text)
        self.assertNotIn("Upload-Status", exported_text)
        self.assertTrue((self.settings.memos_content_dir / "index.md").exists())

    # With 25 notes the index must spill onto a second page, and both pages
    # must link to each other.
    def test_sync_memos_site_writes_paginated_index_pages(self) -> None:
        # NOTE(review): the list item under "tags:" appears unindented in the
        # reviewed copy of this file; confirm against the frontmatter parser.
        for index in range(25):
            note_path = self.settings.obsidian_dir / f"2604{index:02d} Memo {index:02d}.md"
            note_path.write_text(
                f"""---
title: "Memo {index:02d}"
type: "summary"
date: "2026-04-{(index % 28) + 1:02d}"
recorded_at: "2026-04-{(index % 28) + 1:02d}T12:00:00+00:00"
source_id: "src-{index:02d}"
tags:
- "transkript"
---
## Zusammenfassung
Memo {index:02d}
""",
                encoding="utf-8",
            )

        self.pipeline.sync_memos_site()

        index_text = (self.settings.memos_content_dir / "index.md").read_text(encoding="utf-8")
        page_two_text = (self.settings.memos_content_dir / "seite-2.md").read_text(encoding="utf-8")
        self.assertIn("[[seite-2|Ältere Memos]]", index_text)
        self.assertIn("[[index|Neuere Memos]]", page_two_text)
        self.assertIn("## Übersicht Seite 2", page_two_text)

    # A flat "seite-2.html" must additionally be available as
    # "seite-2/index.html" with identical content.
    def test_ensure_pretty_urls_creates_directory_index_copies(self) -> None:
        publisher = self.pipeline.memos_publisher
        # assertIsNotNone reports a regular test failure and still runs under
        # ``python -O``; the bare assert only narrows the type for checkers.
        self.assertIsNotNone(publisher)
        assert publisher is not None
        html_path = self.settings.memos_output_dir / "seite-2.html"
        html_path.parent.mkdir(parents=True, exist_ok=True)
        html_path.write_text("<html>Page 2</html>", encoding="utf-8")

        publisher.ensure_pretty_urls()

        pretty_path = self.settings.memos_output_dir / "seite-2" / "index.html"
        self.assertTrue(pretty_path.exists())
        self.assertEqual(pretty_path.read_text(encoding="utf-8"), "<html>Page 2</html>")

    # subprocess.run is mocked, so no rclone binary is executed; the test only
    # inspects the argument lists that deploy_site() passes to it.
    @mock.patch("transcript_pipeline.subprocess.run")
    def test_memos_deploy_uses_rclone_and_optionally_htpasswd(self, run_mock: mock.Mock) -> None:
        publisher = self.pipeline.memos_publisher
        # See test_ensure_pretty_urls_creates_directory_index_copies for why
        # both forms of the check are present.
        self.assertIsNotNone(publisher)
        assert publisher is not None
        self.settings.memos_rclone_remote = "mln:/home/maddin/transkripte/site"
        self.settings.memos_rclone_excludes = ("README.md", ".gitignore")
        self.settings.memos_sync_htpasswd = True
        self.settings.memos_remote_htpasswd_path = "mln:/home/maddin/transkripte/.htpasswd"
        self.settings.memos_basic_auth_htpasswd_path.write_text("maddin:hash\n", encoding="utf-8")

        publisher.deploy_site()

        # First positional argument of every subprocess.run call.
        commands = [call.args[0] for call in run_mock.call_args_list]
        self.assertIn(
            [
                "/opt/homebrew/bin/rclone",
                "sync",
                "--delete-after",
                "--fast-list",
                "--exclude",
                "README.md",
                "--exclude",
                ".gitignore",
                str(self.settings.memos_output_dir),
                "mln:/home/maddin/transkripte/site",
            ],
            commands,
        )
        self.assertIn(
            [
                "/opt/homebrew/bin/rclone",
                "copyto",
                str(self.settings.memos_basic_auth_htpasswd_path),
                "mln:/home/maddin/transkripte/.htpasswd",
            ],
            commands,
        )
|
||||
|
||||
|
||||
# Run the whole test module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|
||||
Reference in New Issue
Block a user