Also use AI for the content
This commit is contained in:
8
main.py
8
main.py
@@ -23,7 +23,7 @@ from extractors.blog_extractor import BlogExtractor
|
|||||||
from extractors.instagram_extractor import InstagramExtractor
|
from extractors.instagram_extractor import InstagramExtractor
|
||||||
from obsidian_writer import ObsidianWriter
|
from obsidian_writer import ObsidianWriter
|
||||||
from config import Config
|
from config import Config
|
||||||
from summarizer import summarize_text, SummarizationError
|
from summarizer import summarize_text, SummarizationError, format_markdown_content
|
||||||
|
|
||||||
|
|
||||||
def detect_source_type(url: str) -> str:
|
def detect_source_type(url: str) -> str:
|
||||||
@@ -103,6 +103,12 @@ def main():
|
|||||||
print("❌ No content could be extracted")
|
print("❌ No content could be extracted")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
if content.get("content"):
|
||||||
|
try:
|
||||||
|
content["content"] = format_markdown_content(content["content"])
|
||||||
|
except SummarizationError as e:
|
||||||
|
print(f"⚠️ Content formatting failed: {e}")
|
||||||
|
|
||||||
# Generate AI summary + key points
|
# Generate AI summary + key points
|
||||||
if args.summarize or Config.GENERATE_SUMMARY:
|
if args.summarize or Config.GENERATE_SUMMARY:
|
||||||
source_text = "\n\n".join(
|
source_text = "\n\n".join(
|
||||||
|
|||||||
@@ -82,3 +82,61 @@ def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]
|
|||||||
return {"summary": summary, "key_points": key_points}
|
return {"summary": summary, "key_points": key_points}
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
raise SummarizationError(f"Invalid response format: {exc}") from exc
|
raise SummarizationError(f"Invalid response format: {exc}") from exc
|
||||||
|
|
||||||
|
|
||||||
|
def format_markdown_content(text: str, *, max_tokens: int = 800) -> str:
    """Clean and format social content into sensible markdown.

    The text is sent to the chat endpoint configured in ``Config.OPENAI_URL``
    with instructions to:

    - Remove excessive emojis/icons
    - Convert list-like lines into ordered/bulleted lists
    - Remove obvious ads/sponsor lines
    - Normalize whitespace

    Args:
        text: Raw content to clean. Empty or whitespace-only input is
            returned as ``""`` without making any request.
        max_tokens: Completion budget sent with the request. Defaults to
            800, the value this function has always used.

    Returns:
        The cleaned markdown returned by the model, stripped of surrounding
        whitespace.

    Raises:
        SummarizationError: If ``Config.OPENAI_API_KEY`` is not set, the
            request fails, the response cannot be parsed, or the model
            stopped because it ran out of tokens (the output would be a
            truncated copy of the input).
    """
    if not text or not text.strip():
        return ""

    if not Config.OPENAI_API_KEY:
        raise SummarizationError("OPENAI_API_KEY is not set")

    system_prompt = (
        "You are a precise formatter. Return only cleaned markdown text. "
        "Remove ads/sponsor lines, collapse excessive whitespace, "
        "and replace emoji-heavy bullets with normal bullet/numbered lists. "
        "Do not add a title or extra sections."
    )
    user_prompt = (
        "Format the following content:\n\n"
        f"{text}"
    )

    payload = {
        "model": Config.OPENAI_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.1,
        "max_tokens": max_tokens,
    }

    headers = {
        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    # Callers only handle SummarizationError, so every failure at this
    # boundary is deliberately converted into one (with the cause chained).
    try:
        response = requests.post(
            Config.OPENAI_URL,
            headers=headers,
            json=payload,
            timeout=Config.OPENAI_TIMEOUT,
        )
        response.raise_for_status()
    except Exception as exc:
        raise SummarizationError(f"Formatting request failed: {exc}") from exc

    try:
        choice = response.json()["choices"][0]
        cleaned = choice["message"]["content"].strip()
        finish_reason = choice.get("finish_reason")
    except Exception as exc:
        # Same wording summarize_text uses for an unparseable response.
        raise SummarizationError(f"Invalid response format: {exc}") from exc

    # NOTE(review): assumes the endpoint follows the OpenAI chat-completions
    # schema, where finish_reason == "length" means the completion hit the
    # max_tokens limit. The caller overwrites the original content with our
    # return value, so returning a cut-off result would silently lose text;
    # failing lets the caller keep the unformatted original instead.
    if finish_reason == "length":
        raise SummarizationError(
            f"Formatted output was truncated at max_tokens={max_tokens}"
        )

    return cleaned
|
||||||
|
|||||||
Reference in New Issue
Block a user