Files
backblaze-invoices-downloader/summarizer.py
2026-04-04 21:10:33 +02:00

173 lines
5.3 KiB
Python

"""
OpenAI/OpenRouter summarizer and markdown-formatter utility.
Uses OPENAI_API_KEY, OPENAI_URL, OPENAI_MODEL, OPENAI_TIMEOUT and OPENAI_LOG_PAYLOAD from environment (via Config).
"""
from __future__ import annotations
import json
import logging
from typing import Dict, List
import requests
from config import Config
class SummarizationError(RuntimeError):
    """Signals that a summarization or formatting call could not be completed."""
# Module-level logger named after this module; the functions here use it for
# request/response logging.
logger = logging.getLogger(__name__)
def _parse_summary_json(content: str) -> object:
    """
    Decode the model's reply into a JSON value.

    Despite the "JSON only" instruction, models often wrap the object in a
    markdown code fence or add a short preamble. When the raw reply does not
    parse, retry once on the outermost ``{...}`` span before giving up.
    """
    try:
        return json.loads(content)
    except json.JSONDecodeError:
        start = content.find("{")
        end = content.rfind("}")
        if start == -1 or end < start:
            # Nothing that looks like an object; surface the original error.
            raise
        return json.loads(content[start : end + 1])


def summarize_text(text: str, max_points: int = 3) -> Dict[str, List[str] | str]:
    """
    Summarize text into a short summary and key points.

    Sends one chat-completion request to ``Config.OPENAI_URL`` and parses the
    JSON object contained in the model's reply.

    Args:
        text: Content to summarize. Empty or whitespace-only input returns an
            empty result without calling the API.
        max_points: Number of key points requested from the model; also the
            upper bound on how many key points are returned.

    Returns:
        {
            "summary": "string",
            "key_points": ["point 1", "point 2", ...]
        }

    Raises:
        SummarizationError: If the API key is not set, the HTTP request fails,
            or the response cannot be parsed into the expected shape.
    """
    if not text or not text.strip():
        return {"summary": "", "key_points": []}
    if not Config.OPENAI_API_KEY:
        raise SummarizationError("OPENAI_API_KEY is not set")

    payload = {
        "model": Config.OPENAI_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise summarizer. Return JSON only with keys "
                    "`summary` and `key_points` (array of strings). Do not add extra keys."
                ),
            },
            {
                "role": "user",
                "content": (
                    "Summarize the following content in 2-4 sentences and provide "
                    f"{max_points} key points.\n\n"
                    f"CONTENT:\n{text}"
                ),
            },
        ],
        "temperature": 0.2,
        "max_tokens": 400,
    }
    headers = {
        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    # Phase 1: transport. Any failure here (network, HTTP status, non-JSON
    # body) is wrapped so callers only need to handle SummarizationError.
    try:
        logger.info(
            "OpenAI request: url=%s model=%s timeout=%ss input_chars=%s",
            Config.OPENAI_URL,
            Config.OPENAI_MODEL,
            Config.OPENAI_TIMEOUT,
            len(text),
        )
        if Config.OPENAI_LOG_PAYLOAD:
            logger.debug("OpenAI request payload: %s", json.dumps(payload, ensure_ascii=False))
        response = requests.post(
            Config.OPENAI_URL,
            headers=headers,
            json=payload,
            timeout=Config.OPENAI_TIMEOUT,
        )
        logger.info("OpenAI response: status=%s", response.status_code)
        if Config.OPENAI_LOG_PAYLOAD:
            logger.debug("OpenAI response body: %s", response.text)
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        logger.exception("OpenAI request failed")
        raise SummarizationError(f"Request failed: {exc}") from exc

    # Phase 2: interpret the reply. Shape problems are reported separately
    # from transport problems and are logged like the request-failure path.
    try:
        content = data["choices"][0]["message"]["content"].strip()
        result = _parse_summary_json(content)
        if not isinstance(result, dict):
            raise ValueError("expected a JSON object")
        # `or` also covers an explicit JSON null for either key.
        summary = (result.get("summary") or "").strip()
        key_points = [p.strip() for p in (result.get("key_points") or []) if p.strip()]
    except (KeyError, IndexError, TypeError, AttributeError, ValueError) as exc:
        logger.exception("OpenAI response could not be parsed")
        raise SummarizationError(f"Invalid response format: {exc}") from exc

    # The prompt only asks for `max_points` items; enforce it as a real cap.
    return {"summary": summary, "key_points": key_points[: max(max_points, 0)]}
def format_markdown_content(text: str) -> str:
    """
    Clean and format social content into sensible markdown.

    The cleaning itself is delegated to the model via the system prompt:
    - Remove excessive emojis/icons
    - Convert list-like lines into ordered/bulleted lists
    - Remove obvious ads/sponsor lines
    - Normalize whitespace

    Args:
        text: Raw content to format. Empty or whitespace-only input returns
            ``""`` without calling the API.

    Returns:
        The model's reply with surrounding whitespace stripped.

    Raises:
        SummarizationError: If the API key is not set, the HTTP request fails,
            or the response does not have the expected shape.
    """
    if not text or not text.strip():
        return ""
    if not Config.OPENAI_API_KEY:
        raise SummarizationError("OPENAI_API_KEY is not set")

    payload = {
        "model": Config.OPENAI_MODEL,
        "messages": [
            {
                "role": "system",
                "content": (
                    "You are a precise formatter. Return only cleaned markdown text. "
                    "Remove ads/sponsor lines, collapse excessive whitespace, "
                    "and replace emoji-heavy bullets with normal bullet/numbered lists. "
                    "Do not add a title or extra sections."
                ),
            },
            {
                "role": "user",
                "content": (
                    "Format the following content:\n\n"
                    f"{text}"
                ),
            },
        ],
        "temperature": 0.1,
        "max_tokens": 800,
    }
    headers = {
        "Authorization": f"Bearer {Config.OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }

    # Phase 1: transport. Any failure here (network, HTTP status, non-JSON
    # body) is wrapped so callers only need to handle SummarizationError.
    try:
        logger.info(
            "OpenAI format request: url=%s model=%s timeout=%ss input_chars=%s",
            Config.OPENAI_URL,
            Config.OPENAI_MODEL,
            Config.OPENAI_TIMEOUT,
            len(text),
        )
        if Config.OPENAI_LOG_PAYLOAD:
            logger.debug("OpenAI format request payload: %s", json.dumps(payload, ensure_ascii=False))
        response = requests.post(
            Config.OPENAI_URL,
            headers=headers,
            json=payload,
            timeout=Config.OPENAI_TIMEOUT,
        )
        logger.info("OpenAI format response: status=%s", response.status_code)
        if Config.OPENAI_LOG_PAYLOAD:
            logger.debug("OpenAI format response body: %s", response.text)
        response.raise_for_status()
        data = response.json()
    except Exception as exc:
        logger.exception("OpenAI format request failed")
        raise SummarizationError(f"Formatting request failed: {exc}") from exc

    # Phase 2: interpret the reply. A malformed body is not a request failure,
    # so it is reported with the same wording summarize_text uses.
    try:
        choice = data["choices"][0]
        formatted = choice["message"]["content"].strip()
    except (KeyError, IndexError, TypeError, AttributeError) as exc:
        logger.exception("OpenAI format response has unexpected shape")
        raise SummarizationError(f"Invalid response format: {exc}") from exc

    # finish_reason == "length" means the reply hit the max_tokens limit, so the
    # returned markdown is cut short. Still return it, but leave a trace.
    if choice.get("finish_reason") == "length":
        logger.warning(
            "OpenAI format response truncated at max_tokens=%s; output may be incomplete",
            payload["max_tokens"],
        )
    return formatted