reddit/app/scrape_posts.py

import json
import os
import sqlite3
import sys

import config
# Make the YARS package importable from its source checkout.
project_root = config.yars_dir
src_path = os.path.join(project_root, "src")
sys.path.append(src_path)
from yars.yars import YARS
from yars.utils import download_image
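
# A hedged sketch of the SQLite schema this script assumes, inferred only
# from the queries below (the authoritative DDL lives elsewhere in the repo):
#
#   CREATE TABLE post (
#       permalink TEXT PRIMARY KEY,
#       subreddit TEXT, created_utc TEXT, score INTEGER,
#       media_fetched INTEGER, post TEXT,  -- full post serialized as JSON
#       hidden INTEGER, saved INTEGER, author TEXT, body TEXT
#   );
#   CREATE TABLE media (permalink TEXT, url TEXT, local TEXT,
#                       UNIQUE (permalink, url));
#   CREATE TABLE comments (permalink TEXT PRIMARY KEY, comments TEXT);
#   CREATE TABLE block (name TEXT);  -- "/r/sub", "/u/user", or a keyword
#   CREATE TABLE subreddit (subreddit TEXT, minimum_score INTEGER,
#                           fetch_by TEXT, fetch_max INTEGER);
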
# Initialize the YARS Reddit miner
miner = YARS()
# Scrape a subreddit's top posts, applying score and block-list filters
def scrape_subreddit_data(subreddit, minimum_score=100, pull_by="day", limit=5,
                          blocked_subs=(), blocked_users=(), blocked_words=()):
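    """Fetch top posts from `subreddit`, dropping any that fall below
    `minimum_score` or match a block list, and return them as dicts."""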
    ret = []
    print(f"Starting {subreddit}")
    try:
        subreddit_posts = miner.fetch_subreddit_posts(
            subreddit, limit=limit, category="top", time_filter=pull_by
        )
        for post in subreddit_posts:
            score = post.get("score", 0)
            # Top posts arrive in descending score order, so stop at the
            # first one below the floor rather than scanning the rest.
            if score < minimum_score:
                break
            post_data = {
                "permalink": post.get("permalink"),
                "subreddit": post.get("subreddit"),
                "title": post.get("title", ""),
                "author": post.get("author", ""),
                "created_utc": post.get("created_utc", ""),
                "num_comments": post.get("num_comments", 0),
                "score": score,
                "media_urls": post.get("media_urls", []),
                "body": post.get("body", None),
            }
            if post_data["subreddit"] in blocked_subs:
                continue
            if post_data["author"] in blocked_users:
                continue
            for word in blocked_words:
                if word in post_data["title"]:
                    break
                if post_data["body"] is not None and word in post_data["body"]:
                    break
            else:
                # for/else: runs only when no blocked word triggered a break.
                ret.append(post_data)
        print(f"Finished {subreddit}")
        return ret
    except Exception as e:
        print(f"Error occurred while scraping subreddit: {e}")
        return ret

def save_posts_to_db(data, cursor):
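    """Bulk-upsert scraped posts; an existing permalink refreshes its
    score and stored JSON while keeping its hidden/saved/media flags."""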
    if not data:
        return
    # Build one multi-row statement: INSERT ... VALUES (?,...),(?,...) ...
    upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden, saved, author) VALUES "
    upsert += ",".join(["(?,?,?,?,?,?,?,?,?)"] * len(data))
    upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
    binds = []
    for post in data:
        binds.extend([
            post["permalink"],
            post["subreddit"],
            post["created_utc"],
            post["score"],
            False,              # media_fetched
            json.dumps(post),   # full post stored as JSON
            False,              # hidden
            False,              # saved
            post["author"],
        ])
    cursor.execute(upsert, binds)

def download_media(cursor):
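    """Download media for every visible, unfetched post, record the local
    paths in the media table, then mark all unfetched posts as fetched."""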
select = "SELECT post FROM post WHERE media_fetched = ? AND hidden = ?"
binds = [False, False]
results = cursor.execute(select, binds)
post = results.fetchone()
binds = []
while post is not None:
post = json.loads(post[0])
if len(post["media_urls"])>0:
for url in post["media_urls"]:
binds.append(post["permalink"])
binds.append(url)
path = download_image(url, config.media_dir)
binds.append(path)
print(f"Downloaded {path}")
post = results.fetchone()
if len(binds)>0:
upsert = "INSERT INTO media(permalink, url, local) VALUES "
upsert += ",".join(["(?,?,?)"] * (len(binds)//3))
upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
cursor.execute(upsert, binds)
update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
binds = [True, False]
cursor.execute(update, binds)
def download_comments_for_permalink(permalink, cursor):
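    """Currently unused: fetch a post's body and full comment tree and
    store them (body on the post row, comments as JSON in their own table)."""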
    post_details = miner.scrape_post_details(permalink)
    update = "UPDATE post SET body = ? WHERE permalink = ?"
    binds = [post_details["body"], permalink]
    cursor.execute(update, binds)
    upsert = "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
    # The comment tree is nested data, so store it as JSON like the post column.
    binds = [permalink, json.dumps(post_details["comments"])]
    cursor.execute(upsert, binds)

def get_blocks(cursor):
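    """Split block-list entries into subreddits ("/r/..."), users ("/u/..."),
    and plain keywords, stripping the three-character prefixes."""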
select = """
SELECT
name
FROM
block
"""
blocks = [row[0] for row in cursor.execute(select)]
subs = []
users = []
words = []
for block in blocks:
if "/r/" in block:
subs.append(block[3:])
elif "/u/" in block:
users.append(block[3:])
else:
words.append(block)
return subs, users, words
# Main execution
if __name__ == "__main__":
    os.makedirs(config.media_dir, exist_ok=True)
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()
    select = """
        SELECT
            subreddit,
            minimum_score,
            fetch_by,
            fetch_max
        FROM
            subreddit
    """
    subreddits = cursor.execute(select).fetchall()
    blocked_subs, blocked_users, blocked_words = get_blocks(cursor)
    for name, minimum_score, fetch_by, fetch_max in subreddits:
        post_data = scrape_subreddit_data(
            name, minimum_score, fetch_by, fetch_max,
            blocked_subs, blocked_users, blocked_words,
        )
        save_posts_to_db(post_data, cursor)
        connection.commit()
    download_media(cursor)
    connection.commit()
    connection.close()