From 684f7c87e6c872a49d8d7202cf9b1dd28c989936 Mon Sep 17 00:00:00 2001 From: Jan Bader Date: Sat, 4 Apr 2026 21:00:56 +0200 Subject: [PATCH] Harder ignoring of ui prompts --- extractors/instagram_extractor.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/extractors/instagram_extractor.py b/extractors/instagram_extractor.py index 776b4db..f017482 100644 --- a/extractors/instagram_extractor.py +++ b/extractors/instagram_extractor.py @@ -100,6 +100,13 @@ class InstagramExtractor: "allow the use of cookies", "use of cookies", "cookies and similar technologies", + "cookies policy", + "cookie preferences", + "learn more about cookies", + "review or change your choices", + "essential cookies", + "optional cookies", + "cookies from other companies", "meta products", "safer experience", "information we receive from cookies", @@ -163,7 +170,9 @@ class InstagramExtractor: if buffer: block = "\n".join(buffer) if not _looks_like_language_list(block): - cleaned_lines.extend(buffer) + cleaned_lines.extend( + [line for line in buffer if not _looks_like_ui_prompt(line)] + ) buffer.clear() for line in lines: @@ -171,6 +180,8 @@ class InstagramExtractor: if not stripped: flush_buffer() continue + if _looks_like_ui_prompt(stripped): + continue if len(stripped) <= 24: buffer.append(stripped) else: