Important code snippets:
Scraping Instagram Comments with caption:
def _extract_shortcode(post_url):
    """Return the post shortcode embedded in an Instagram post URL.

    Works with or without a trailing slash and ignores any query string
    or fragment. Raises ValueError when the URL has no path segments.
    """
    from urllib.parse import urlparse

    segments = [part for part in urlparse(post_url.strip()).path.split("/") if part]
    if not segments:
        raise ValueError(f"No shortcode found in Instagram URL: {post_url!r}")
    # Prefer the segment that follows a known post-type marker
    # (e.g. .../p/<shortcode>/ or .../reel/<shortcode>/).
    for marker, candidate in zip(segments, segments[1:]):
        if marker in ("p", "reel", "reels", "tv"):
            return candidate
    # Otherwise fall back to the last non-empty segment, which is what the
    # previous split("/")[-2] logic returned for URLs ending in "/".
    return segments[-1]


def get_ig_comments(post_url, username, *, batch_size=100, pause_seconds=60):
    """Scrape every comment of an Instagram post together with its caption.

    Args:
        post_url: URL of the post; the shortcode is parsed from its path.
        username: Instagram account whose saved Instaloader session is
            loaded from ``~/.config/instaloader/session-<username>``.
        batch_size: Number of comments to collect between pauses. A falsy
            value (0 or None) disables pausing.
        pause_seconds: Seconds to sleep after each full batch.

    Returns:
        A list of ``(comment_author_username, comment_text, caption)``
        tuples; ``caption`` is "No caption" when the post has none.

    Raises:
        ValueError: If no shortcode can be extracted from ``post_url``.
    """
    from pathlib import Path

    # Validate the URL first so bad input fails before any session/network work.
    shortcode = _extract_shortcode(post_url)

    loader = instaloader.Instaloader()
    # Resolve the session file under the current user's home directory rather
    # than a hard-coded "/Users/..." path.
    session_file = Path.home() / ".config" / "instaloader" / f"session-{username}"
    loader.load_session_from_file(username, str(session_file))

    post = instaloader.Post.from_shortcode(loader.context, shortcode)
    caption = post.caption or "No caption"

    comments_data = []
    for count, comment in enumerate(post.get_comments(), start=1):
        comments_data.append((comment.owner.username, comment.text, caption))
        # Throttle: sleep after every full batch of comments.
        if batch_size and count % batch_size == 0:
            time.sleep(pause_seconds)
    return comments_data
Results of this script:
The script fetches the post caption, scrapes the comments, automatically pauses after every 100 comments to comply with Instagram’s API rate limits, and saves the extracted data in a structured CSV file for easy analysis.
Extracting Text from Images (OCR):
def process_image_for_description(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    The image is read with OpenCV, converted to grayscale, binarised with a
    fixed threshold of 150, and inverted before being passed to Tesseract.

    Returns:
        The raw OCR output when it contains any non-whitespace characters,
        "No readable text found in the image." when it does not, or
        "Error: Unable to load image." when OpenCV cannot read the file.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        return "Error: Unable to load image."

    # Pre-processing: grayscale -> binary threshold -> inverted image.
    grayscale = cv2.cvtColor(bgr, cv2.COLOR_BGR2GRAY)
    binary = cv2.threshold(grayscale, 150, 255, cv2.THRESH_BINARY)[1]
    ocr_input = cv2.bitwise_not(binary)

    # Tesseract options: OCR engine mode 3, page segmentation mode 6.
    text = pytesseract.image_to_string(ocr_input, config=r'--oem 3 --psm 6')
    if text.strip():
        return text
    return "No readable text found in the image."
Converting Images to Text (Captioning) with BLIP:
def process_image_for_blip_description(image_path):
    """Generate a caption for the image using the BLIP model.

    Args:
        image_path: Path of the image file to caption.

    Returns:
        The decoded caption string, or a string starting with
        "Error: Unable to load image." followed by the exception message
        when the file cannot be opened or decoded.
    """
    try:
        # Image.open() is lazy: convert() forces the pixel data to be decoded
        # inside the try block, so corrupt files are reported here, and the
        # with-statement closes the underlying file handle afterwards.
        # Converting to RGB normalises grayscale/palette/RGBA inputs before
        # they reach the processor.
        with Image.open(image_path) as source:
            image = source.convert("RGB")
    except Exception as e:  # best-effort: any load failure becomes an error string
        return f"Error: Unable to load image. {e}"

    inputs = processor(images=image, return_tensors="pt")
    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        out = caption_model.generate(**inputs)
    return processor.decode(out[0], skip_special_tokens=True)