Ignore language list
This commit is contained in:
@@ -24,15 +24,15 @@ except ImportError:
|
|||||||
|
|
||||||
class InstagramExtractor:
|
class InstagramExtractor:
|
||||||
"""Extract content from Instagram reels."""
|
"""Extract content from Instagram reels."""
|
||||||
|
|
||||||
def __init__(self, url: str, headless: bool = True):
|
def __init__(self, url: str, headless: bool = True):
|
||||||
self.url = url
|
self.url = url
|
||||||
self.headless = headless
|
self.headless = headless
|
||||||
self.data = {}
|
self.data = {}
|
||||||
|
|
||||||
if sync_playwright is None:
|
if sync_playwright is None:
|
||||||
raise ImportError("playwright not installed. Run: pip install playwright && playwright install")
|
raise ImportError("playwright not installed. Run: pip install playwright && playwright install")
|
||||||
|
|
||||||
def extract(self) -> Dict[str, Any]:
|
def extract(self) -> Dict[str, Any]:
|
||||||
"""Extract content from Instagram reel."""
|
"""Extract content from Instagram reel."""
|
||||||
try:
|
try:
|
||||||
@@ -41,28 +41,28 @@ class InstagramExtractor:
|
|||||||
page = browser.new_page(
|
page = browser.new_page(
|
||||||
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
|
||||||
)
|
)
|
||||||
|
|
||||||
# Navigate to the reel
|
# Navigate to the reel
|
||||||
print(f"📱 Loading Instagram reel...")
|
print(f"📱 Loading Instagram reel...")
|
||||||
page.goto(self.url, timeout=30000)
|
page.goto(self.url, timeout=30000)
|
||||||
|
|
||||||
# Wait for content to load
|
# Wait for content to load
|
||||||
time.sleep(3)
|
time.sleep(3)
|
||||||
|
|
||||||
# Try to close any cookies/login prompts
|
# Try to close any cookies/login prompts
|
||||||
try:
|
try:
|
||||||
page.click('button:has-text("Not Now")', timeout=3000)
|
page.click('button:has-text("Not Now")', timeout=3000)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
try:
|
try:
|
||||||
page.click('button:has-text("Allow")', timeout=3000)
|
page.click('button:has-text("Allow")', timeout=3000)
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extract data
|
# Extract data
|
||||||
self.data = self._extract_data(page)
|
self.data = self._extract_data(page)
|
||||||
|
|
||||||
browser.close()
|
browser.close()
|
||||||
except PlaywrightTimeout:
|
except PlaywrightTimeout:
|
||||||
print("⚠️ Timeout loading Instagram page")
|
print("⚠️ Timeout loading Instagram page")
|
||||||
@@ -70,9 +70,9 @@ class InstagramExtractor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Error: {str(e)}")
|
print(f"⚠️ Error: {str(e)}")
|
||||||
self.data = self._fallback_extract()
|
self.data = self._fallback_extract()
|
||||||
|
|
||||||
return self.data
|
return self.data
|
||||||
|
|
||||||
def _extract_data(self, page) -> Dict[str, Any]:
|
def _extract_data(self, page) -> Dict[str, Any]:
|
||||||
"""Extract data from loaded page."""
|
"""Extract data from loaded page."""
|
||||||
data = {
|
data = {
|
||||||
@@ -83,20 +83,30 @@ class InstagramExtractor:
|
|||||||
"key_points": [],
|
"key_points": [],
|
||||||
"tags": ["instagram", "reel"],
|
"tags": ["instagram", "reel"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def _looks_like_language_list(text: str) -> bool:
|
||||||
|
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
||||||
|
if len(lines) < 8:
|
||||||
|
return False
|
||||||
|
short_lines = [line for line in lines if len(line) <= 20]
|
||||||
|
if len(short_lines) / len(lines) < 0.8:
|
||||||
|
return False
|
||||||
|
single_tokenish = [line for line in short_lines if len(line.split()) <= 2]
|
||||||
|
return len(single_tokenish) / len(lines) > 0.7
|
||||||
|
|
||||||
# Try to get caption/description
|
# Try to get caption/description
|
||||||
try:
|
try:
|
||||||
# Look for caption text
|
# Look for caption text
|
||||||
captions = page.query_selector_all('h1, h2, span')
|
captions = page.query_selector_all('h1, h2, span')
|
||||||
for caption in captions:
|
for caption in captions:
|
||||||
text = caption.inner_text()
|
text = caption.inner_text().strip()
|
||||||
if len(text) > 20 and len(text) < 500:
|
if len(text) > 20 and len(text) < 500 and not _looks_like_language_list(text):
|
||||||
if not data["description"]:
|
if not data["description"]:
|
||||||
data["description"] = text
|
data["description"] = text
|
||||||
break
|
break
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Could not extract caption: {e}")
|
print(f"⚠️ Could not extract caption: {e}")
|
||||||
|
|
||||||
# Try to get author
|
# Try to get author
|
||||||
try:
|
try:
|
||||||
author_elem = page.query_selector('a[href*="/"] h1, a[href*="/"] h2, header span')
|
author_elem = page.query_selector('a[href*="/"] h1, a[href*="/"] h2, header span')
|
||||||
@@ -104,7 +114,7 @@ class InstagramExtractor:
|
|||||||
data["author"] = author_elem.inner_text().strip()
|
data["author"] = author_elem.inner_text().strip()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Try to get engagement metrics
|
# Try to get engagement metrics
|
||||||
try:
|
try:
|
||||||
likes_elem = page.query_selector('span:has-text("likes"), span:has-text("views")')
|
likes_elem = page.query_selector('span:has-text("likes"), span:has-text("views")')
|
||||||
@@ -112,41 +122,64 @@ class InstagramExtractor:
|
|||||||
data["views"] = likes_elem.inner_text().strip()
|
data["views"] = likes_elem.inner_text().strip()
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Extract any visible text as content
|
# Extract any visible text as content
|
||||||
try:
|
try:
|
||||||
# Get all text content
|
# Get all text content
|
||||||
body_text = page.inner_text('body')
|
body_text = page.inner_text('body')
|
||||||
|
|
||||||
# Filter for meaningful content
|
# Filter for meaningful content
|
||||||
lines = body_text.split('\n')
|
lines = body_text.split('\n')
|
||||||
|
cleaned_lines = []
|
||||||
|
buffer = []
|
||||||
|
|
||||||
|
def flush_buffer():
|
||||||
|
if buffer:
|
||||||
|
block = "\n".join(buffer)
|
||||||
|
if not _looks_like_language_list(block):
|
||||||
|
cleaned_lines.extend(buffer)
|
||||||
|
buffer.clear()
|
||||||
|
|
||||||
|
for line in lines:
|
||||||
|
stripped = line.strip()
|
||||||
|
if not stripped:
|
||||||
|
flush_buffer()
|
||||||
|
continue
|
||||||
|
if len(stripped) <= 24:
|
||||||
|
buffer.append(stripped)
|
||||||
|
else:
|
||||||
|
flush_buffer()
|
||||||
|
cleaned_lines.append(stripped)
|
||||||
|
|
||||||
|
flush_buffer()
|
||||||
|
|
||||||
meaningful_lines = [
|
meaningful_lines = [
|
||||||
line.strip() for line in lines
|
line for line in cleaned_lines
|
||||||
if len(line.strip()) > 30 and len(line.strip()) < 300
|
if len(line) > 30 and len(line) < 300
|
||||||
]
|
]
|
||||||
|
|
||||||
data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
|
data["content"] = "\n\n".join(meaningful_lines[:10])[:5000]
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"⚠️ Could not extract page text: {e}")
|
print(f"⚠️ Could not extract page text: {e}")
|
||||||
|
|
||||||
# Generate key points from description
|
# Generate key points from description
|
||||||
if data["description"]:
|
if data["description"]:
|
||||||
sentences = data["description"].split('.')[:3]
|
sentences = data["description"].split('.')[:3]
|
||||||
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
|
data["key_points"] = [s.strip() + '.' for s in sentences if len(s.strip()) > 20]
|
||||||
|
|
||||||
# Add URL-based tags
|
# Add URL-based tags
|
||||||
parsed = urlparse(self.url)
|
parsed = urlparse(self.url)
|
||||||
if '/reel/' in parsed.path:
|
if '/reel/' in parsed.path:
|
||||||
data["tags"].append("reel")
|
data["tags"].append("reel")
|
||||||
if '/video/' in parsed.path:
|
if '/video/' in parsed.path:
|
||||||
data["tags"].append("video")
|
data["tags"].append("video")
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
def _fallback_extract(self) -> Dict[str, Any]:
|
def _fallback_extract(self) -> Dict[str, Any]:
|
||||||
"""Fallback extraction when browser automation fails."""
|
"""Fallback extraction when browser automation fails."""
|
||||||
print("⚠️ Using fallback extraction method...")
|
print("⚠️ Using fallback extraction method...")
|
||||||
|
|
||||||
# Try to extract what we can from the URL itself
|
# Try to extract what we can from the URL itself
|
||||||
data = {
|
data = {
|
||||||
"title": "Instagram Content",
|
"title": "Instagram Content",
|
||||||
@@ -159,7 +192,7 @@ class InstagramExtractor:
|
|||||||
],
|
],
|
||||||
"tags": ["instagram", "fallback"],
|
"tags": ["instagram", "fallback"],
|
||||||
}
|
}
|
||||||
|
|
||||||
# Extract reel ID from URL
|
# Extract reel ID from URL
|
||||||
try:
|
try:
|
||||||
parsed = urlparse(self.url)
|
parsed = urlparse(self.url)
|
||||||
@@ -171,5 +204,5 @@ class InstagramExtractor:
|
|||||||
break
|
break
|
||||||
except:
|
except:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
return data
|
return data
|
||||||
|
|||||||
Reference in New Issue
Block a user