import config
import json
import os
import re
import sqlite3
import sys

# Make the vendored YARS package importable from its source checkout
project_root = config.yars_dir
src_path = os.path.join(project_root, "src")
sys.path.append(src_path)

from yars.yars import YARS
from yars.utils import download_image

# Initialize the YARS Reddit miner
miner = YARS()


# Scrape top posts from a subreddit and return them as a list of dicts
def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5):
    ret = []
    print(f"Starting {subreddit} with min score {minimum_score}, by {pull_by}, limit {limit}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit, limit=limit, category="top", time_filter=pull_by
        )
        for post in subreddit_posts:
            # "top" results arrive sorted by score, so stop at the first post
            # below the threshold
            if post.get("score", 0) < minimum_score:
                break
            post_data = {
                "permalink": post.get("permalink"),
                "title": post.get("title", ""),
                "author": post.get("author", ""),
                "created_utc": post.get("created_utc", ""),
                "num_comments": post.get("num_comments", 0),
                "score": post.get("score", 0),
                "media_urls": post.get("media_urls", []),
                "body": post.get("body"),
            }
            ret.append(post_data)
        print(f"Finished {subreddit}")
        return ret
    except Exception as e:
        print(f"Error occurred while scraping subreddit: {e}")
        return ret


# Upsert scraped posts as one multi-row INSERT ... ON CONFLICT statement.
# Fine for small batches; SQLite caps bound parameters per statement, so a
# very large batch would need chunking.
def save_posts_to_db(data, cursor):
    if len(data) == 0:
        return
    upsert = (
        "INSERT INTO post"
        "(permalink, subreddit, created_utc, score, media_fetched, post, hidden)"
        " VALUES "
    )
    upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
    binds = []
    for post in data:
        binds.append(post["permalink"])
        # Recover the subreddit name from the permalink, e.g. /r/pics/comments/...
        m = re.search(r"/r/([A-Za-z0-9_]+)/", post["permalink"])
        binds.append(m.group(1) if m else "")  # subreddit
        binds.append(post["created_utc"])
        binds.append(post["score"])
        binds.append(False)  # media_fetched
        binds.append(json.dumps(post))  # full post dict stored as JSON
        binds.append(False)  # hidden
    cursor.execute(upsert, binds)


# Download media for every post not yet fetched and record the local paths
def download_media(cursor):
    select = "SELECT post FROM post WHERE media_fetched = ? AND hidden = ?"
    results = cursor.execute(select, [False, False])
    # results is the same cursor object; the loop drains it completely before
    # the cursor is reused for the upsert below
    post = results.fetchone()
    binds = []
    while post is not None:
        post = json.loads(post[0])
        for url in post["media_urls"]:
            binds.append(post["permalink"])
            binds.append(url)
            path = download_image(url, config.media_dir)
            binds.append(path)
            print(f"Downloaded {path}")
        post = results.fetchone()
    if len(binds) > 0:
        upsert = "INSERT INTO media(permalink, url, local) VALUES "
        upsert += ",".join(["(?,?,?)"] * (len(binds) // 3))
        upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
        cursor.execute(upsert, binds)
    # Mark every scanned post as fetched so it is not re-downloaded next run
    update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
    cursor.execute(update, [True, False])


# Fetch full post details (body and comment tree) for one permalink.
# Currently unused.
def download_comments_for_permalink(permalink, cursor):
    post_details = miner.scrape_post_details(permalink)
    # NOTE: this expects a `body` column on post, which save_posts_to_db
    # never populates
    update = "UPDATE post SET body = ? WHERE permalink = ?"
    binds = [post_details["body"], permalink]
    cursor.execute(update, binds)
    upsert = (
        "INSERT INTO comments(permalink, comments) VALUES (?, ?)"
        " ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    )
    # The comment tree is a nested structure; serialize it to JSON text so it
    # can be bound as a single SQLite parameter
    binds = [permalink, json.dumps(post_details["comments"])]
    cursor.execute(upsert, binds)


# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()
    select = """
        SELECT subreddit, minimum_score, fetch_by, fetch_max
        FROM subreddit
    """
    subreddits = cursor.execute(select).fetchall()
    for name, minimum_score, fetch_by, fetch_max in subreddits:
        post_data = scrape_subreddit_data(name, minimum_score, fetch_by, fetch_max)
        save_posts_to_db(post_data, cursor)
        connection.commit()
    download_media(cursor)
    connection.commit()
    connection.close()
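
# ---------------------------------------------------------------------------
# Hypothetical helper, not part of the original script: the code above assumes
# the four tables below already exist. This is a sketch of a schema
# reconstructed from the queries in this file; column types, defaults, and the
# UNIQUE constraints implied by the ON CONFLICT clauses are assumptions. Run
# it once against a fresh database before the first scrape, e.g.:
#
#     connection = sqlite3.connect(config.db_file)
#     init_db(connection.cursor())
#     connection.execute(
#         "INSERT INTO subreddit VALUES (?,?,?,?)", ("pics", 100, "day", 5)
#     )  # seed one subreddit to scrape (values are illustrative)
#     connection.commit()
def init_db(cursor):
    cursor.executescript(
        """
        CREATE TABLE IF NOT EXISTS subreddit (
            subreddit     TEXT PRIMARY KEY,
            minimum_score INTEGER NOT NULL DEFAULT 100,
            fetch_by      TEXT NOT NULL DEFAULT 'day',  -- YARS time_filter value
            fetch_max     INTEGER NOT NULL DEFAULT 5
        );
        CREATE TABLE IF NOT EXISTS post (
            permalink     TEXT PRIMARY KEY,  -- ON CONFLICT(permalink) relies on this
            subreddit     TEXT,
            created_utc   TEXT,
            score         INTEGER,
            media_fetched INTEGER NOT NULL DEFAULT 0,  -- boolean stored as 0/1
            post          TEXT,  -- full post dict as JSON
            body          TEXT,  -- only written by the unused comments helper
            hidden        INTEGER NOT NULL DEFAULT 0
        );
        CREATE TABLE IF NOT EXISTS media (
            permalink TEXT,
            url       TEXT,
            local     TEXT,  -- path returned by download_image
            UNIQUE (permalink, url)  -- ON CONFLICT(permalink, url) relies on this
        );
        CREATE TABLE IF NOT EXISTS comments (
            permalink TEXT PRIMARY KEY,
            comments  TEXT  -- comment tree as JSON
        );
        """
    )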