Also use AI for the content

2026-04-04 20:56:44 +02:00
parent 1c719f4381
commit d32c696f6e
2 changed files with 65 additions and 1 deletions
@@ -23,7 +23,7 @@ from extractors.blog_extractor import BlogExtractor
 from extractors.instagram_extractor import InstagramExtractor
 from obsidian_writer import ObsidianWriter
 from config import Config
-from summarizer import summarize_text, SummarizationError
+from summarizer import summarize_text, SummarizationError, format_markdown_content
 def detect_source_type(url: str) -> str:
@@ -103,6 +103,12 @@ def main():
        print("❌ No content could be extracted")
        sys.exit(1)
    if content.get("content"):
        try:
            content["content"] = format_markdown_content(content["content"])
        except SummarizationError as e:
            print(f"⚠️  Content formatting failed: {e}")
    # Generate AI summary + key points
    if args.summarize or Config.GENERATE_SUMMARY:
        source_text = "\n\n".join(
@@ -82,3 +82,61 @@ def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]
        return {"summary": summary, "key_points": key_points}
    except Exception as exc:
        raise SummarizationError(f"Invalid response format: {exc}") from exc
 def format_markdown_content(text: str) -> str:
    """
    Clean and format social content into sensible markdown.

    The text is sent to the chat-completions endpoint configured in
    ``Config`` with instructions to:
    - Remove excessive emojis/icons
    - Convert list-like lines into ordered/bulleted lists
    - Remove obvious ads/sponsor lines
    - Normalize whitespace

    Args:
        text: Raw extracted content. Empty or whitespace-only input is
            returned as ``""`` without calling the API.

    Returns:
        The formatted markdown, stripped of leading/trailing whitespace.

    Raises:
        SummarizationError: If the API key is missing, the request or
            response parsing fails, the reply was cut off by the
            ``max_tokens`` limit, or the reply contains no text. Raising in
            the last two cases lets the caller fall back to the unformatted
            text instead of storing a truncated or empty version.
    """
    if not text or not text.strip():
        return ""
    if not Config.OPENAI_API_KEY:
        raise SummarizationError("OPENAI_API_KEY is not set")
    payload = {
        "model": Config.OPENAI_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise formatter. Return only cleaned markdown text. "
                    "Remove ads/sponsor lines, collapse excessive whitespace, "
                    "and replace emoji-heavy bullets with normal bullet/numbered lists. "
                    "Do not add a title or extra sections."
                ),
            },
            {
                "role": "user",
                "content": (
                    "Format the following content:\n\n"
                    f"{text}"
                ),
            },
        ],
        "temperature": 0.1,
        "max_tokens": 800,
    }
    headers = {
        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    try:
        response = requests.post(
            Config.OPENAI_URL,
            headers=headers,
            json=payload,
            timeout=Config.OPENAI_TIMEOUT,
        )
        response.raise_for_status()
        choice = response.json()["choices"][0]
        formatted = choice["message"]["content"]
        finish_reason = choice.get("finish_reason")
    except Exception as exc:
        # Boundary with the network/JSON layer: surface every transport or
        # shape problem as the single error type callers already handle.
        raise SummarizationError(f"Formatting request failed: {exc}") from exc
    # Validation lives outside the try so these errors are not re-wrapped
    # as "request failed".
    if finish_reason == "length":
        # The reply hit max_tokens, so it is an incomplete copy of the
        # input; returning it would silently drop the rest of the content.
        raise SummarizationError(
            "Formatted content was truncated (max_tokens limit reached)"
        )
    if not isinstance(formatted, str) or not formatted.strip():
        # Input was non-blank (checked above), so a blank reply means the
        # formatter discarded everything; do not let it replace the content.
        raise SummarizationError("Formatting response contained no content")
    return formatted.strip()