# reddit/app/scrape_posts.py

import json
import os
import re
import sqlite3
import sys

import config
# Make the vendored YARS checkout importable.
sys.path.append(os.path.join(config.yars_dir, "src"))
from yars.yars import YARS
from yars.utils import download_image
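
# This script assumes a sibling `config` module. The values below are a
# hypothetical sketch of what it needs to provide, inferred from usage in
# this file (names are real, example values are not):
#
#   yars_dir = "/path/to/yars"        # checkout containing src/yars
#   pull_by = "week"                  # time_filter passed to fetch_subreddit_posts
#   subreddits = [("python", 100)]    # (name, minimum_score) pairs
#   max_posts_per_pull = 25           # posts requested per subreddit
#   media_dir = "/path/to/media"      # where images are downloaded
#   db_file = "/path/to/reddit.db"    # SQLite database file
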
# Initialize the YARS Reddit miner
miner = YARS()


# Scrape top posts from a subreddit, keeping only those that meet the
# configured minimum score. `subreddit` is a (name, minimum_score) pair.
def scrape_subreddit_data(subreddit, limit=5):
    ret = []
    subreddit_name, minimum_score = subreddit
    print(f"Starting {subreddit_name}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit_name, limit=limit, category="top", time_filter=config.pull_by
        )
        for post in subreddit_posts:
            if post.get("score", 0) < minimum_score:
                continue
            post_data = {
                "permalink": post.get("permalink"),
                "title": post.get("title", ""),
                "author": post.get("author", ""),
                "created_utc": post.get("created_utc", ""),
                "num_comments": post.get("num_comments", 0),
                "score": post.get("score", 0),
                "media_urls": post.get("media_urls", []),
                "body": post.get("body", None),
            }
            ret.append(post_data)
        print(f"Finished {subreddit_name}")
    except Exception as e:
        print(f"Error occurred while scraping {subreddit_name}: {e}")
    return ret


# Bulk-upsert scraped posts into the `post` table, refreshing the score and
# the serialized post JSON when a permalink already exists.
def save_posts_to_db(data, cursor):
    if len(data) == 0:
        return
    upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden) VALUES "
    upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
    binds = []
    for post in data:
        binds.append(post["permalink"])
        # Extract the subreddit name from the permalink, e.g. /r/<name>/...
        m = re.search(r"/r/([a-zA-Z0-9_]+)/", post["permalink"])
        binds.append(m.group(1) if m else None)
        binds.append(post["created_utc"])
        binds.append(post["score"])
        binds.append(False)  # media_fetched
        binds.append(json.dumps(post))
        binds.append(False)  # hidden
    cursor.execute(upsert, binds)
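
# Note on save_posts_to_db: each post contributes 7 bound parameters, so a
# single batch is capped by SQLite's host-parameter limit (999 on builds
# older than 3.32.0, 32766 since). Very large pulls would need chunking.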


# Download media for any posts whose media has not been fetched yet, record
# the local file paths in the `media` table, and mark those posts as done.
def download_media(cursor):
    select = "SELECT post FROM post WHERE media_fetched = ?"
    results = cursor.execute(select, [False])
    binds = []
    row = results.fetchone()
    while row is not None:
        post = json.loads(row[0])
        for url in post["media_urls"]:
            binds.append(post["permalink"])
            binds.append(url)
            binds.append(download_image(url, config.media_dir))
            print("image downloaded")
        row = results.fetchone()
    if len(binds) > 0:
        upsert = "INSERT INTO media(permalink, url, local) VALUES "
        upsert += ",".join(["(?,?,?)"] * (len(binds) // 3))
        upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
        cursor.execute(upsert, binds)
    update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
    cursor.execute(update, [True, False])


# Fetch full post details and comments for a single permalink.
# Currently unused.
def download_comments_for_permalink(permalink, cursor):
    post_details = miner.scrape_post_details(permalink)
    update = "UPDATE post SET body = ? WHERE permalink = ?"
    cursor.execute(update, [post_details["body"], permalink])
    upsert = "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    # Comments come back as a nested structure, so serialize them to JSON.
    cursor.execute(upsert, [permalink, json.dumps(post_details["comments"])])
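
# The queries above assume a schema along these lines (inferred from this
# file's SQL, not taken from an actual migration):
#
#   CREATE TABLE post (
#       permalink TEXT PRIMARY KEY, subreddit TEXT, created_utc TEXT,
#       score INTEGER, media_fetched INTEGER, post TEXT, hidden INTEGER
#   );
#   CREATE TABLE media (
#       permalink TEXT, url TEXT, local TEXT, PRIMARY KEY (permalink, url)
#   );
#   CREATE TABLE comments (permalink TEXT PRIMARY KEY, comments TEXT);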

# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()
    for subreddit in config.subreddits:
        post_data = scrape_subreddit_data(subreddit, config.max_posts_per_pull)
        save_posts_to_db(post_data, cursor)
        connection.commit()
    download_media(cursor)
    connection.commit()
    connection.close()