From d32c696f6ee32feb1f2d94977af0b226e2e9c886 Mon Sep 17 00:00:00 2001
From: Jan Bader
Date: Sat, 4 Apr 2026 20:56:44 +0200
Subject: [PATCH] Also use AI for the content

---
 main.py       |  8 +++++-
 summarizer.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 2bad7e9..19ce12d 100644
--- a/main.py
+++ b/main.py
@@ -23,7 +23,7 @@ from extractors.blog_extractor import BlogExtractor
 from extractors.instagram_extractor import InstagramExtractor
 from obsidian_writer import ObsidianWriter
 from config import Config
-from summarizer import summarize_text, SummarizationError
+from summarizer import summarize_text, SummarizationError, format_markdown_content
 
 
 def detect_source_type(url: str) -> str:
@@ -103,6 +103,12 @@ def main():
         print("❌ No content could be extracted")
         sys.exit(1)
 
+    if content.get("content"):
+        try:
+            content["content"] = format_markdown_content(content["content"])
+        except SummarizationError as e:
+            print(f"⚠️ Content formatting failed: {e}")
+
     # Generate AI summary + key points
     if args.summarize or Config.GENERATE_SUMMARY:
         source_text = "\n\n".join(
diff --git a/summarizer.py b/summarizer.py
index 2550f97..84d6c77 100644
--- a/summarizer.py
+++ b/summarizer.py
@@ -82,3 +82,77 @@ def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]
         return {"summary": summary, "key_points": key_points}
     except Exception as exc:
         raise SummarizationError(f"Invalid response format: {exc}") from exc
+
+
+def format_markdown_content(text: str, max_tokens: int = 800) -> str:
+    """
+    Clean and format social content into sensible markdown.
+
+    - Remove excessive emojis/icons
+    - Convert list-like lines into ordered/bulleted lists
+    - Remove obvious ads/sponsor lines
+    - Normalize whitespace
+
+    Blank input returns "" without calling the API. max_tokens caps the
+    length of the reply. Raises SummarizationError when the API key is
+    missing, the request fails, or the reply is empty or was cut off at
+    max_tokens, so callers can keep their original text.
+    """
+    if not text or not text.strip():
+        return ""
+
+    if not Config.OPENAI_API_KEY:
+        raise SummarizationError("OPENAI_API_KEY is not set")
+
+    payload = {
+        "model": Config.OPENAI_MODEL,
+        "messages": [
+            {
+                "role": "system",
+                "content": (
+                    "You are a precise formatter. Return only cleaned markdown text. "
+                    "Remove ads/sponsor lines, collapse excessive whitespace, "
+                    "and replace emoji-heavy bullets with normal bullet/numbered lists. "
+                    "Do not add a title or extra sections."
+                ),
+            },
+            {
+                "role": "user",
+                "content": (
+                    "Format the following content:\n\n"
+                    f"{text}"
+                ),
+            },
+        ],
+        "temperature": 0.1,
+        "max_tokens": max_tokens,
+    }
+
+    headers = {
+        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
+        "Content-Type": "application/json",
+    }
+
+    try:
+        response = requests.post(
+            Config.OPENAI_URL,
+            headers=headers,
+            json=payload,
+            timeout=Config.OPENAI_TIMEOUT,
+        )
+        response.raise_for_status()
+        choice = response.json()["choices"][0]
+        formatted = choice["message"]["content"]
+    except Exception as exc:
+        raise SummarizationError(f"Formatting request failed: {exc}") from exc
+
+    # The caller overwrites its text with this result, so a reply cut off at
+    # max_tokens would silently drop the remainder of the content.
+    if choice.get("finish_reason") == "length":
+        raise SummarizationError("Formatted content was truncated at max_tokens")
+
+    # Refuse empty/null replies so non-empty input is never replaced by "".
+    if not isinstance(formatted, str) or not formatted.strip():
+        raise SummarizationError("Formatting returned no content")
+
+    return formatted.strip()