diff --git a/extractors/instagram_extractor.py b/extractors/instagram_extractor.py index 776b4db..f017482 100644 --- a/extractors/instagram_extractor.py +++ b/extractors/instagram_extractor.py @@ -100,6 +100,13 @@ class InstagramExtractor: "allow the use of cookies", "use of cookies", "cookies and similar technologies", + "cookies policy", + "cookie preferences", + "learn more about cookies", + "review or change your choices", + "essential cookies", + "optional cookies", + "cookies from other companies", "meta products", "safer experience", "information we receive from cookies", @@ -163,7 +170,9 @@ class InstagramExtractor: if buffer: block = "\n".join(buffer) if not _looks_like_language_list(block): - cleaned_lines.extend(buffer) + cleaned_lines.extend( + [line for line in buffer if not _looks_like_ui_prompt(line)] + ) buffer.clear() for line in lines: @@ -171,6 +180,8 @@ class InstagramExtractor: if not stripped: flush_buffer() continue + if _looks_like_ui_prompt(stripped): + continue if len(stripped) <= 24: buffer.append(stripped) else: