Also use AI for the content

This commit is contained in:
Jan Bader
2026-04-04 20:56:44 +02:00
parent 1c719f4381
commit d32c696f6e
2 changed files with 65 additions and 1 deletions

View File

@@ -23,7 +23,7 @@ from extractors.blog_extractor import BlogExtractor
from extractors.instagram_extractor import InstagramExtractor
from obsidian_writer import ObsidianWriter
from config import Config
from summarizer import summarize_text, SummarizationError
from summarizer import summarize_text, SummarizationError, format_markdown_content
def detect_source_type(url: str) -> str:
@@ -103,6 +103,12 @@ def main():
print("❌ No content could be extracted")
sys.exit(1)
if content.get("content"):
try:
content["content"] = format_markdown_content(content["content"])
except SummarizationError as e:
print(f"⚠️ Content formatting failed: {e}")
# Generate AI summary + key points
if args.summarize or Config.GENERATE_SUMMARY:
source_text = "\n\n".join(

View File

@@ -82,3 +82,61 @@ def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]
return {"summary": summary, "key_points": key_points}
except Exception as exc:
raise SummarizationError(f"Invalid response format: {exc}") from exc
def format_markdown_content(text: str) -> str:
    """
    Clean and format social content into sensible markdown.

    The text is sent to the chat endpoint at ``Config.OPENAI_URL`` with a
    system prompt that asks the model to:

    - Remove excessive emojis/icons
    - Convert list-like lines into ordered/bulleted lists
    - Remove obvious ads/sponsor lines
    - Normalize whitespace

    Args:
        text: Raw extracted content to clean up.

    Returns:
        The model's reply with surrounding whitespace stripped, or ``""``
        when ``text`` is empty or whitespace-only (no request is made).

    Raises:
        SummarizationError: If ``Config.OPENAI_API_KEY`` is unset, the
            request or response parsing fails, the reply was cut off by the
            token limit, or the reply is empty. In the last two cases the
            caller is better off keeping its original text than replacing
            it with a partial or blank result.
    """
    # Nothing to format: skip the network round trip entirely.
    if not text or not text.strip():
        return ""

    if not Config.OPENAI_API_KEY:
        raise SummarizationError("OPENAI_API_KEY is not set")

    payload = {
        "model": Config.OPENAI_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise formatter. Return only cleaned markdown text. "
                    "Remove ads/sponsor lines, collapse excessive whitespace, "
                    "and replace emoji-heavy bullets with normal bullet/numbered lists. "
                    "Do not add a title or extra sections."
                ),
            },
            {
                "role": "user",
                "content": (
                    "Format the following content:\n\n"
                    f"{text}"
                ),
            },
        ],
        # Low temperature: this is a mechanical clean-up, not creative writing.
        "temperature": 0.1,
        "max_tokens": 800,
    }
    headers = {
        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    try:
        response = requests.post(
            Config.OPENAI_URL,
            headers=headers,
            json=payload,
            timeout=Config.OPENAI_TIMEOUT,
        )
        response.raise_for_status()
        data = response.json()
        choice = data["choices"][0]
        formatted = choice["message"]["content"].strip()
    except Exception as exc:
        # Deliberately broad: transport errors, HTTP errors, invalid JSON and
        # an unexpected response shape are all reported through the single
        # exception type that callers handle.
        raise SummarizationError(f"Formatting request failed: {exc}") from exc

    # The reply is meant to be the *whole* cleaned text. If generation stopped
    # at the max_tokens cap the tail of the content is missing, so fail rather
    # than let the caller overwrite the full text with a truncated copy.
    # NOTE(review): assumes an OpenAI-compatible response where a capped reply
    # reports finish_reason == "length" -- confirm for the configured endpoint.
    if choice.get("finish_reason") == "length":
        raise SummarizationError(
            "Formatted content was truncated by the max_tokens limit"
        )

    # An empty reply would wipe out non-empty input; treat it as a failure.
    if not formatted:
        raise SummarizationError("Formatting returned empty content")

    return formatted