# reddit/app/scrape_posts.py

import json
import os
import re
import sqlite3
import sys

import config
# Make the vendored YARS checkout importable.
sys.path.append(os.path.join(config.yars_dir, "src"))
from yars.yars import YARS
from yars.utils import download_image
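
# This script assumes a sibling `config` module. The values below are a
# hypothetical sketch of what it needs to provide, inferred from usage in
# this file (names are real, example values are not):
#
#   yars_dir = "/path/to/yars"        # checkout containing src/yars
#   pull_by = "week"                  # time_filter passed to fetch_subreddit_posts
#   subreddits = [("python", 100)]    # (name, minimum_score) pairs
#   max_posts_per_pull = 25           # posts requested per subreddit
#   media_dir = "/path/to/media"      # where images are downloaded
#   db_file = "/path/to/reddit.db"    # SQLite database file
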
# Initialize the YARS Reddit miner
miner = YARS()


# Scrape top posts from a subreddit, keeping only those that meet the
# configured minimum score. `subreddit` is a (name, minimum_score) pair.
def scrape_subreddit_data(subreddit, limit=5):
    ret = []
    subreddit_name, minimum_score = subreddit
    print(f"Starting {subreddit_name}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit_name, limit=limit, category="top", time_filter=config.pull_by
        )
        for post in subreddit_posts:
            if post.get("score", 0) < minimum_score:
                continue
            post_data = {
                "permalink": post.get("permalink"),
                "title": post.get("title", ""),
                "author": post.get("author", ""),
                "created_utc": post.get("created_utc", ""),
                "num_comments": post.get("num_comments", 0),
                "score": post.get("score", 0),
                "media_urls": post.get("media_urls", []),
                "body": post.get("body", None),
            }
            ret.append(post_data)
        print(f"Finished {subreddit_name}")
    except Exception as e:
        print(f"Error occurred while scraping {subreddit_name}: {e}")
    return ret


# Bulk-upsert scraped posts into the `post` table, refreshing the score and
# the serialized post JSON when a permalink already exists.
def save_posts_to_db(data, cursor):
    if len(data) == 0:
        return
    upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden) VALUES "
    upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
    binds = []
    for post in data:
        binds.append(post["permalink"])
        # Extract the subreddit name from the permalink, e.g. /r/<name>/...
        m = re.search(r"/r/([a-zA-Z0-9_]+)/", post["permalink"])
        binds.append(m.group(1) if m else None)
        binds.append(post["created_utc"])
        binds.append(post["score"])
        binds.append(False)  # media_fetched
        binds.append(json.dumps(post))
        binds.append(False)  # hidden
    cursor.execute(upsert, binds)
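
# Note on save_posts_to_db: each post contributes 7 bound parameters, so a
# single batch is capped by SQLite's host-parameter limit (999 on builds
# older than 3.32.0, 32766 since). Very large pulls would need chunking.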


# Download media for any posts whose media has not been fetched yet, record
# the local file paths in the `media` table, and mark those posts as done.
def download_media(cursor):
    select = "SELECT post FROM post WHERE media_fetched = ?"
    results = cursor.execute(select, [False])
    binds = []
    row = results.fetchone()
    while row is not None:
        post = json.loads(row[0])
        for url in post["media_urls"]:
            binds.append(post["permalink"])
            binds.append(url)
            binds.append(download_image(url, config.media_dir))
            print("image downloaded")
        row = results.fetchone()
    if len(binds) > 0:
        upsert = "INSERT INTO media(permalink, url, local) VALUES "
        upsert += ",".join(["(?,?,?)"] * (len(binds) // 3))
        upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
        cursor.execute(upsert, binds)
    update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
    cursor.execute(update, [True, False])


# Fetch full post details and comments for a single permalink.
# Currently unused.
def download_comments_for_permalink(permalink, cursor):
    post_details = miner.scrape_post_details(permalink)
    update = "UPDATE post SET body = ? WHERE permalink = ?"
    cursor.execute(update, [post_details["body"], permalink])
    upsert = "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    # Comments come back as a nested structure, so serialize them to JSON.
    cursor.execute(upsert, [permalink, json.dumps(post_details["comments"])])
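
# The queries above assume a schema along these lines (inferred from this
# file's SQL, not taken from an actual migration):
#
#   CREATE TABLE post (
#       permalink TEXT PRIMARY KEY, subreddit TEXT, created_utc TEXT,
#       score INTEGER, media_fetched INTEGER, post TEXT, hidden INTEGER
#   );
#   CREATE TABLE media (
#       permalink TEXT, url TEXT, local TEXT, PRIMARY KEY (permalink, url)
#   );
#   CREATE TABLE comments (permalink TEXT PRIMARY KEY, comments TEXT);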

# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()
    for subreddit in config.subreddits:
        post_data = scrape_subreddit_data(subreddit, config.max_posts_per_pull)
        save_posts_to_db(post_data, cursor)
        connection.commit()
    download_media(cursor)
    connection.commit()
    connection.close()