import json
import os
import sqlite3
import sys

import config

# Make the vendored YARS package importable.
src_path = os.path.join(config.yars_dir, "src")
sys.path.append(src_path)

from yars.yars import YARS
from yars.utils import download_image

# Initialize the YARS Reddit miner
miner = YARS()


# Scrape top posts from a subreddit, applying the score threshold and block
# lists, and return them as a list of dicts.
def scrape_subreddit_data(
    subreddit,
    minimum_score=100,
    pull_by="day",
    limit=5,
    blocked_subs=(),
    blocked_users=(),
    blocked_words=(),
):
    ret = []
    print(f"Starting {subreddit}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit, limit=limit, category="top", time_filter=pull_by
        )
        for post in subreddit_posts:
            score = post.get("score", 0)
            # Posts come back sorted by score, so stop at the first one
            # below the threshold.
            if score < minimum_score:
                break
            post_data = {
                "permalink": post.get("permalink"),
                "subreddit": post.get("subreddit"),
                "title": post.get("title", ""),
                "author": post.get("author", ""),
                "created_utc": post.get("created_utc", ""),
                "num_comments": post.get("num_comments", 0),
                "score": score,
                "media_urls": post.get("media_urls", []),
                "body": post.get("body", None),
            }
            if post_data["subreddit"] in blocked_subs:
                continue
            if post_data["author"] in blocked_users:
                continue
            for word in blocked_words:
                if word in post_data["title"]:
                    break
                if post_data["body"] is not None and word in post_data["body"]:
                    break
            else:
                # No blocked word matched, so keep the post.
                ret.append(post_data)
        print(f"Finished {subreddit}")
        return ret
    except Exception as e:
        print(f"Error occurred while scraping subreddit: {e}")
        return ret


def save_posts_to_db(data, cursor):
    if len(data) == 0:
        return
    # Multi-row upsert keyed on permalink; re-scraped posts refresh their
    # score and stored JSON.
    upsert = (
        "INSERT INTO post(permalink, subreddit, created_utc, score, "
        "media_fetched, post, hidden, saved, author) VALUES "
    )
    upsert += ",".join(["(?,?,?,?,?,?,?,?,?)"] * len(data))
    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
    binds = []
    for post in data:
        binds.append(post["permalink"])
        binds.append(post["subreddit"])
        binds.append(post["created_utc"])
        binds.append(post["score"])
        binds.append(False)  # media_fetched
        binds.append(json.dumps(post))  # full post stored as JSON
        binds.append(False)  # hidden
        binds.append(False)  # saved
        binds.append(post["author"])
    cursor.execute(upsert, binds)


def download_media(cursor):
    select = "SELECT post FROM post WHERE media_fetched = ? AND hidden = ?"
    rows = cursor.execute(select, [False, False]).fetchall()
    downloads = {}  # url -> local path, so each URL is downloaded at most once
    binds = []
    for row in rows:
        post = json.loads(row[0])
        for url in post["media_urls"]:
            if url not in downloads:
                downloads[url] = download_image(url, config.media_dir)
            path = downloads[url]
            binds.append(post["permalink"])
            binds.append(url)
            binds.append(path)
            print(f"Downloaded {path}")
    if len(binds) > 0:
        upsert = "INSERT INTO media(permalink, url, local) VALUES "
        upsert += ",".join(["(?,?,?)"] * (len(binds) // 3))
        upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
        cursor.execute(upsert, binds)
    update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
    cursor.execute(update, [True, False])


def download_comments_for_permalink(permalink, cursor):
    # Currently unused
    post_details = miner.scrape_post_details(permalink)
    update = "UPDATE post SET body = ? WHERE permalink = ?"
    cursor.execute(update, [post_details["body"], permalink])
    upsert = (
        "INSERT INTO comments(permalink, comments) VALUES (?, ?) "
        "ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    )
    # Comments are serialized to JSON so sqlite can store them in a TEXT column.
    cursor.execute(upsert, [permalink, json.dumps(post_details["comments"])])


def get_blocks(cursor):
    select = "SELECT name FROM block"
    blocks = [row[0] for row in cursor.execute(select)]
    subs = []
    users = []
    words = []
    for block in blocks:
        # Block entries use Reddit-style prefixes: "/r/<sub>", "/u/<user>",
        # or a bare blocked word.
        if block.startswith("/r/"):
            subs.append(block[3:])
        elif block.startswith("/u/"):
            users.append(block[3:])
        else:
            words.append(block)
    return subs, users, words


# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()

    select = "SELECT subreddit, minimum_score, fetch_by, fetch_max FROM subreddit"
    subreddits = cursor.execute(select).fetchall()
    blocked_subs, blocked_users, blocked_words = get_blocks(cursor)

    for subreddit, minimum_score, fetch_by, fetch_max in subreddits:
        post_data = scrape_subreddit_data(
            subreddit,
            minimum_score,
            fetch_by,
            fetch_max,
            blocked_subs,
            blocked_users,
            blocked_words,
        )
        save_posts_to_db(post_data, cursor)
        connection.commit()

    download_media(cursor)
    connection.commit()
    connection.close()
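

# The queries above assume a pre-existing SQLite schema that this script never
# creates. The sketch below is inferred purely from the SQL in this file (the
# real schema lives elsewhere in the project and may differ): it is the minimal
# shape every statement needs, including the uniqueness constraints required by
# the ON CONFLICT clauses. Kept as a reference string, not executed here.
SCHEMA_SKETCH = """
CREATE TABLE IF NOT EXISTS subreddit(
    subreddit     TEXT PRIMARY KEY,
    minimum_score INTEGER,
    fetch_by      TEXT,    -- time_filter passed to YARS, e.g. 'day'
    fetch_max     INTEGER  -- post limit per fetch
);
CREATE TABLE IF NOT EXISTS block(
    name TEXT PRIMARY KEY  -- '/r/<sub>', '/u/<user>', or a bare blocked word
);
CREATE TABLE IF NOT EXISTS post(
    permalink     TEXT PRIMARY KEY,  -- required by ON CONFLICT(permalink)
    subreddit     TEXT,
    created_utc   TEXT,
    score         INTEGER,
    media_fetched BOOLEAN,
    post          TEXT,  -- full post dict serialized as JSON
    hidden        BOOLEAN,
    saved         BOOLEAN,
    author        TEXT,
    body          TEXT   -- filled in by download_comments_for_permalink
);
CREATE TABLE IF NOT EXISTS media(
    permalink TEXT,
    url       TEXT,
    local     TEXT,         -- path returned by download_image
    UNIQUE(permalink, url)  -- required by ON CONFLICT(permalink, url)
);
CREATE TABLE IF NOT EXISTS comments(
    permalink TEXT PRIMARY KEY,
    comments  TEXT  -- comment tree serialized as JSON
);
"""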