Initial Release

John Stephani 2025-12-23 13:44:14 -06:00
commit 6609e7253b
23 changed files with 9093 additions and 0 deletions

5
.gitignore vendored Normal file

@ -0,0 +1,5 @@
media/
db/
**/YARS.log
**/__pycache__/
**/.venv/

28
README.md Executable file

@ -0,0 +1,28 @@
### Places to configure
#### ./compose.yaml
You can change the host port, the host volume directories, how often Reddit is scanned for new data, and how often old data is removed.
#### ./app/config.py
You can change how much data is pulled, which subreddits it comes from, the minimum score a post needs before it is saved to your DB, and how long data is retained.
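For reference, the defaults in `./app/config.py` look like this (an excerpt of the values shipped in this commit; trim or extend the subreddit list to taste):
```
max_posts_per_pull = 100      # posts requested per subreddit on each scrape
pull_by = "day"               # time filter used when pulling "top" posts
subreddits = [
    # (name, minimum upvotes a post needs to be saved)
    ("selfhosted", 100),
    ("homelab", 100),
]
max_age_days = 30             # posts older than this get cleaned up
posts_per_page_load = 50      # posts rendered per subreddit page
```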
### Startup
```
docker compose build
docker compose up
```
The DB is created automatically. You will want to run
```
docker exec -it reddit-web-1 sh -c "python3 /app/scrape_posts.py"
```
to populate the DB with initial data; otherwise the web page will not be usable until the scheduled scrape task runs.
### Thanks
This wouldn't be possible without https://github.com/datavorous/yars, which has been yoinked and tweaked to handle some more complex media cases. I will port my knowledge back to the project when time permits.
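For reference, the main tweak is that `fetch_subreddit_posts` in `app/YARS/src/yars/yars.py` attaches a `media_urls` list to each post (gallery images resolved to direct i.redd.it links, and Reddit-hosted videos split into a video URL plus a companion audio URL), which is what `scrape_posts.py` downloads. A minimal sketch of using it directly; the path and subreddit name here are just examples:
```
import sys
sys.path.append("app/YARS/src")  # adjust if you are not running from the repo root

from yars.yars import YARS

miner = YARS()
posts = miner.fetch_subreddit_posts("selfhosted", limit=5, category="top", time_filter="day")
for post in posts:
    # media_urls is only present when the post actually has media attached
    print(post["permalink"], post.get("media_urls", []))
```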

27
app/Dockerfile Executable file

@ -0,0 +1,27 @@
# syntax=docker/dockerfile:1.4
FROM --platform=$BUILDPLATFORM python:3.10-alpine AS builder
WORKDIR /app
COPY requirements.txt /app
RUN --mount=type=cache,target=/root/.cache/pip \
pip3 install -r requirements.txt
COPY . /app
ENTRYPOINT ["python3"]
CMD ["app.py"]
FROM builder AS dev-envs
RUN <<EOF
apk update
apk add git
EOF
RUN <<EOF
addgroup -S docker
adduser -S --shell /bin/bash --ingroup docker vscode
EOF
# install Docker tools (cli, buildx, compose)
COPY --from=gloursdocker/docker / /

162
app/YARS/.gitignore vendored Normal file

@ -0,0 +1,162 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

41
app/YARS/.pre-commit-config.yaml Normal file

@ -0,0 +1,41 @@
ci:
  autofix_commit_msg: "[pre-commit.ci] auto fixes from pre-commit.com hooks"
  autofix_prs: true
  autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
  autoupdate_schedule: quarterly
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-ast
      - id: check-yaml
      - id: check-toml
      - id: check-merge-conflict
      - id: mixed-line-ending
      - id: check-case-conflict
      - id: sort-simple-yaml
        files: .pre-commit-config.yaml
  - repo: https://github.com/hadialqattan/pycln
    rev: v2.4.0
    hooks:
      - id: pycln
        args: [--config=pyproject.toml, src]
        types: [file]
        types_or: [python, pyi]
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 24.8.0
    hooks:
      - id: black
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.9
    hooks:
      - id: ruff
        types: [file]
        types_or: [python, pyi, toml]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell

21
app/YARS/LICENSE Normal file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Sagnik Bhattacharjee
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

172
app/YARS/README.md Normal file

@ -0,0 +1,172 @@
<div align="center">
<img src="logo.svg" width="10%">
# YARS (Yet Another Reddit Scraper)
[![GitHub stars](https://img.shields.io/github/stars/datavorous/yars.svg?style=social&label=Stars&style=plastic)](https://github.com/datavorous/yars/stargazers)<br>
</div>
YARS is a Python package designed to simplify the process of scraping Reddit for posts, comments, user data, and other media. The package also includes utility functions for displaying results, exporting them to JSON or CSV, and downloading media. It is built using **Python** and relies on the **requests** module for fetching data from Reddit's public API. The scraper uses simple `.json` requests, avoiding the need for official Reddit API keys, which keeps it lightweight and easy to use.
## Features
- **Reddit Search**: Search Reddit for posts using a keyword query.
- **Post Scraping**: Scrape post details, including title, body, and comments.
- **User Data Scraping**: Fetch recent activity (posts and comments) of a Reddit user.
- **Subreddit Posts Fetching**: Retrieve posts from specific subreddits with flexible options for category and time filters.
- **Image Downloading**: Download images from posts.
- **Results Display**: Utilize `Pygments` for colorful display of JSON-formatted results.
> [!WARNING]
> Use with rotating proxies, or Reddit might gift you an IP ban.
> I could extract at most 2552 posts at once from 'r/all' using this.
> [Here](https://files.catbox.moe/zdra2i.json) is a **7.1 MB JSON** file containing the top 100 posts from 'r/nosleep', including post titles, body text, all comments and their replies, post scores, time of upload, etc.
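Since the constructor already accepts an optional proxy (applied to both `http` and `https` traffic) and a request timeout, a rotating-proxy endpoint can be plugged in directly; the proxy URL below is a placeholder:
```python
from yars import YARS

# Placeholder endpoint -- substitute your own rotating proxy.
miner = YARS(proxy="http://user:pass@proxy.example.com:8080", timeout=10)
posts = miner.fetch_subreddit_posts("all", limit=50, category="top", time_filter="day")
```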
## Dependencies
- `requests`
- `Pygments`
## Installation
1. Clone the repository:
```
git clone https://github.com/datavorous/YARS.git
```
Navigate inside the ```src``` folder.
2. Install ```uv``` (if not already installed):
```
pip install uv
```
3. Run the application:
```
uv run example/example.py
```
It'll set up the virtual env, install the necessary packages, and run the ```example.py``` program.
## Usage
We will use the following Python script to demonstrate the functionality of the scraper. The script includes:
- Searching Reddit
- Scraping post details
- Fetching user data
- Retrieving subreddit posts
- Downloading images from posts
#### Code Overview
```python
from yars import YARS
from utils import display_results, download_image
miner = YARS()
```
#### Step 1: Searching Reddit
The `search_reddit` method allows you to search Reddit using a query string. Here, we search for posts containing "OpenAI" and limit the results to 3 posts. The `display_results` function is used to present the results in a formatted way.
```python
search_results = miner.search_reddit("OpenAI", limit=3)
display_results(search_results, "SEARCH")
```
#### Step 2: Scraping Post Details
Next, we scrape details of a specific Reddit post by passing its permalink. If the post details are successfully retrieved, they are displayed using `display_results`. Otherwise, an error message is printed.
```python
permalink = "https://www.reddit.com/r/getdisciplined/comments/1frb5ib/what_single_health_test_or_practice_has/".split('reddit.com')[1]
post_details = miner.scrape_post_details(permalink)
if post_details:
    display_results(post_details, "POST DATA")
else:
    print("Failed to scrape post details.")
```
#### Step 3: Fetching User Data
We can also retrieve a Reddit user's recent activity (posts and comments) using the `scrape_user_data` method. Here, we fetch data for the user `iamsecb` and limit the results to 2 items.
```python
user_data = miner.scrape_user_data("iamsecb", limit=2)
display_results(user_data, "USER DATA")
```
#### Step 4: Fetching Subreddit Posts
The `fetch_subreddit_posts` method retrieves posts from a specified subreddit. In this example, we fetch 11 top posts from the "generative" subreddit from the past week.
```python
subreddit_posts = miner.fetch_subreddit_posts("generative", limit=11, category="top", time_filter="week")
display_results(subreddit_posts, "EarthPorn SUBREDDIT New Posts")
```
#### Step 5: Downloading Images
For the posts retrieved from the subreddit, we try to download their associated images. The `download_image` function is used for this. If the post doesn't have an `image_url`, the thumbnail URL is used as a fallback.
```python
for z in range(3):
    try:
        image_url = subreddit_posts[z]["image_url"]
    except KeyError:
        image_url = subreddit_posts[z]["thumbnail_url"]
    download_image(image_url)
```
### Complete Code Example
```python
from yars import YARS
from utils import display_results, download_image
miner = YARS()
# Search for posts related to "OpenAI"
search_results = miner.search_reddit("OpenAI", limit=3)
display_results(search_results, "SEARCH")
# Scrape post details using its permalink
permalink = "https://www.reddit.com/r/getdisciplined/comments/1frb5ib/what_single_health_test_or_practice_has/".split('reddit.com')[1]
post_details = miner.scrape_post_details(permalink)
if post_details:
    display_results(post_details, "POST DATA")
else:
    print("Failed to scrape post details.")
# Fetch recent activity of user "iamsecb"
user_data = miner.scrape_user_data("iamsecb", limit=2)
display_results(user_data, "USER DATA")
# Fetch top posts from the subreddit "generative" from the past week
subreddit_posts = miner.fetch_subreddit_posts("generative", limit=11, category="top", time_filter="week")
display_results(subreddit_posts, "EarthPorn SUBREDDIT New Posts")
# Download images from the fetched posts
for z in range(3):
    try:
        image_url = subreddit_posts[z]["image_url"]
    except KeyError:
        image_url = subreddit_posts[z]["thumbnail_url"]
    download_image(image_url)
```
You can now use these techniques to explore and scrape data from Reddit programmatically.
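The bundled `utils` module also provides `export_to_json` and `export_to_csv` helpers if you want to keep the results around; a quick sketch (output filenames here are just examples):
```python
from yars import YARS
from utils import export_to_json, export_to_csv

miner = YARS()
results = miner.search_reddit("OpenAI", limit=10)

# Writes the list of result dicts to disk.
export_to_json(results, "openai_results.json")
export_to_csv(results, "openai_results.csv")
```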
## Contributing
Contributions are welcome! For feature requests, bug reports, or questions, please open an issue. If you would like to contribute code, please open a pull request with your changes.
### Our Notable Contributors
<a href="https://github.com/datavorous/yars/graphs/contributors">
<img src="https://contrib.rocks/image?repo=datavorous/yars" />

4
app/YARS/logo.svg Normal file

@ -0,0 +1,4 @@
<svg width="217" height="229" viewBox="0 0 217 229" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M206.56 195.464C189.638 211.477 168.078 222.382 144.606 226.8C121.134 231.218 96.8052 228.95 74.6953 220.284C52.5854 211.618 33.6878 196.942 20.3922 178.113C7.09653 159.283 0 137.146 0 114.5C0 91.854 7.09653 69.7166 20.3922 50.8872C33.6878 32.0578 52.5855 17.382 74.6953 8.71579C96.8052 0.0495596 121.134 -2.21792 144.606 2.2001C168.078 6.61811 189.638 17.5232 206.56 33.5363L179.384 59.2528C167.836 48.3259 153.124 40.8846 137.108 37.8699C121.092 34.8552 104.49 36.4025 89.4031 42.316C74.316 48.2296 61.4208 58.2439 52.3483 71.0925C43.2757 83.9412 38.4332 99.0471 38.4332 114.5C38.4332 129.953 43.2757 145.059 52.3483 157.907C61.4208 170.756 74.316 180.77 89.4031 186.684C104.49 192.598 121.092 194.145 137.108 191.13C153.124 188.115 167.836 180.674 179.384 169.747L206.56 195.464Z" fill="#FB471A"/>
<path d="M52.2642 51.0675C65.76 38.6206 82.9546 30.1442 101.674 26.7101C120.393 23.276 139.796 25.0385 157.429 31.7747C175.062 38.5109 190.133 49.9183 200.737 64.5543C211.34 79.1902 217 96.3975 217 114C217 131.603 211.34 148.81 200.737 163.446C190.133 178.082 175.062 189.489 157.429 196.225C139.796 202.961 120.393 204.724 101.674 201.29C82.9546 197.856 65.7599 189.379 52.2642 176.932L80.2341 151.136C88.1979 158.481 98.3445 163.483 109.391 165.51C120.437 167.536 131.886 166.496 142.292 162.521C152.697 158.546 161.591 151.815 167.848 143.178C174.105 134.541 177.445 124.387 177.445 114C177.445 103.613 174.105 93.4588 167.848 84.8221C161.591 76.1854 152.697 69.454 142.292 65.4789C131.886 61.5039 120.437 60.4639 109.391 62.4903C98.3445 64.5168 88.1979 69.5187 80.2341 76.8636L52.2642 51.0675Z" fill="#FFBA4A"/>
</svg>


0
app/YARS/src/README.md Normal file

12
app/YARS/src/pyproject.toml Normal file

@ -0,0 +1,12 @@
[project]
name = "sm"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
"flask>=3.0.3",
"meta-ai-api>=1.2.1",
"pygments>=2.18.0",
"requests>=2.32.3",
]


7542
app/YARS/src/yars/agents.py Normal file

File diff suppressed because it is too large.

15
app/YARS/src/yars/sessions.py Normal file

@ -0,0 +1,15 @@
from requests import Session
from .agents import get_agent
class RandomUserAgentSession(Session):
    """
    Session class (inherited from requests.Session) which passes
    a random user agent with each request
    """

    def request(self, *args, **kwargs):
        self.headers.update({"User-Agent": get_agent()})
        return super().request(*args, **kwargs)

96
app/YARS/src/yars/utils.py Normal file

@ -0,0 +1,96 @@
import os
import csv
import json
import logging
import requests
from urllib.parse import urlparse
from pygments import formatters, highlight, lexers
logging.basicConfig(
level=logging.INFO, filename="YARS.log", format="%(asctime)s - %(message)s"
)
def display_results(results, title):
try:
print(f"\n{'-'*20} {title} {'-'*20}")
if isinstance(results, list):
for item in results:
if isinstance(item, dict):
formatted_json = json.dumps(item, sort_keys=True, indent=4)
colorful_json = highlight(
formatted_json,
lexers.JsonLexer(),
formatters.TerminalFormatter(),
)
print(colorful_json)
else:
print(item)
elif isinstance(results, dict):
formatted_json = json.dumps(results, sort_keys=True, indent=4)
colorful_json = highlight(
formatted_json, lexers.JsonLexer(), formatters.TerminalFormatter()
)
print(colorful_json)
else:
logging.warning(
"No results to display: expected a list or dictionary, got %S",
type(results),
)
print("No results to display.")
except Exception as e:
logging.error(f"Error displaying results: {e}")
print("Error displaying results.")
def download_image(image_url, output_folder="images", session=None):
os.makedirs(output_folder, exist_ok=True)
filename = urlparse(image_url).path.lstrip('/')
filepath = os.path.join(output_folder, filename)
os.makedirs(os.path.dirname(filepath), exist_ok=True)
if session is None:
session = requests.Session()
try:
response = session.get(image_url, stream=True)
response.raise_for_status()
with open(filepath, "wb") as f:
for chunk in response.iter_content(8192):
f.write(chunk)
logging.info("Downloaded: %s", filepath)
return filepath
except requests.RequestException as e:
logging.error("Failed to download %s: %s", image_url, e)
return None
except Exception as e:
logging.error("An error occurred while saving the image: %s", e)
return None
def export_to_json(data, filename="output.json"):
try:
with open(filename, "w", encoding="utf-8") as json_file:
json.dump(data, json_file, indent=4)
print(f"Data successfully exported to {filename}")
except Exception as e:
print(f"Error exporting to JSON: {e}")
def export_to_csv(data, filename="output.csv"):
try:
keys = data[0].keys()
with open(filename, "w", newline="", encoding="utf-8") as output_file:
dict_writer = csv.DictWriter(output_file, fieldnames=keys)
dict_writer.writeheader()
dict_writer.writerows(data)
print(f"Data successfully exported to {filename}")
except Exception as e:
print(f"Error exporting to CSV: {e}")

309
app/YARS/src/yars/yars.py Normal file

@ -0,0 +1,309 @@
from __future__ import annotations
from .sessions import RandomUserAgentSession
import time
import random
import logging
import re
import requests
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
logging.basicConfig(
filename="YARS.log",
level=logging.INFO,
format="%(asctime)s - %(levelname)s - %(message)s",
)
class YARS:
__slots__ = ("headers", "session", "proxy", "timeout")
def __init__(self, proxy=None, timeout=10, random_user_agent=True):
self.session = RandomUserAgentSession() if random_user_agent else requests.Session()
self.proxy = proxy
self.timeout = timeout
retries = Retry(
total=5,
backoff_factor=2, # Exponential backoff
status_forcelist=[429, 500, 502, 503, 504],
)
self.session.mount("https://", HTTPAdapter(max_retries=retries))
if proxy:
self.session.proxies.update({"http": proxy, "https": proxy})
def handle_search(self,url, params, after=None, before=None):
if after:
params["after"] = after
if before:
params["before"] = before
try:
response = self.session.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
logging.info("Search request successful")
except Exception as e:
if response.status_code != 200:
logging.info("Search request unsuccessful due to: %s", e)
print(f"Failed to fetch search results: {response.status_code}")
return []
data = response.json()
results = []
for post in data["data"]["children"]:
post_data = post["data"]
results.append(
{
"title": post_data["title"],
"link": f"https://www.reddit.com{post_data['permalink']}",
"description": post_data.get("selftext", "")[:269],
}
)
logging.info("Search Results Retrned %d Results", len(results))
return results
def search_reddit(self, query, limit=10, after=None, before=None):
url = "https://www.reddit.com/search.json"
params = {"q": query, "limit": limit, "sort": "relevance", "type": "link"}
return self.handle_search(url, params, after, before)
def search_subreddit(self, subreddit, query, limit=10, after=None, before=None, sort="relevance"):
url = f"https://www.reddit.com/r/{subreddit}/search.json"
params = {"q": query, "limit": limit, "sort": "relevance", "type": "link","restrict_sr":"on"}
return self.handle_search(url, params, after, before)
def scrape_post_details(self, permalink):
url = f"https://www.reddit.com{permalink}.json"
try:
response = self.session.get(url, timeout=self.timeout)
response.raise_for_status()
logging.info("Post details request successful : %s", url)
except Exception as e:
logging.info("Post details request unsccessful: %e", e)
if response.status_code != 200:
print(f"Failed to fetch post data: {response.status_code}")
return None
post_data = response.json()
if not isinstance(post_data, list) or len(post_data) < 2:
logging.info("Unexpected post data structre")
print("Unexpected post data structure")
return None
main_post = post_data[0]["data"]["children"][0]["data"]
title = main_post["title"]
body = main_post.get("selftext", "")
comments = self._extract_comments(post_data[1]["data"]["children"])
logging.info("Successfully scraped post: %s", title)
return {"title": title, "body": body, "comments": comments}
def _extract_comments(self, comments):
logging.info("Extracting comments")
extracted_comments = []
for comment in comments:
if isinstance(comment, dict) and comment.get("kind") == "t1":
comment_data = comment.get("data", {})
extracted_comment = {
"author": comment_data.get("author", ""),
"body": comment_data.get("body", ""),
"score": comment_data.get("score",""),
"replies": [],
}
replies = comment_data.get("replies", "")
if isinstance(replies, dict):
extracted_comment["replies"] = self._extract_comments(
replies.get("data", {}).get("children", [])
)
extracted_comments.append(extracted_comment)
logging.info("Successfully extracted comments")
return extracted_comments
def scrape_user_data(self, username, limit=10):
logging.info("Scraping user data for %s, limit: %d", username, limit)
base_url = f"https://www.reddit.com/user/{username}/.json"
params = {"limit": limit, "after": None}
all_items = []
count = 0
while count < limit:
try:
response = self.session.get(
base_url, params=params, timeout=self.timeout
)
response.raise_for_status()
logging.info("User data request successful")
except Exception as e:
logging.info("User data request unsuccessful: %s", e)
if response.status_code != 200:
print(
f"Failed to fetch data for user {username}: {response.status_code}"
)
break
try:
data = response.json()
except ValueError:
print(f"Failed to parse JSON response for user {username}.")
break
if "data" not in data or "children" not in data["data"]:
print(
f"No 'data' or 'children' field found in response for user {username}."
)
logging.info("No 'data' or 'children' field found in response")
break
items = data["data"]["children"]
if not items:
print(f"No more items found for user {username}.")
logging.info("No more items found for user")
break
for item in items:
kind = item["kind"]
item_data = item["data"]
if kind == "t3":
post_url = f"https://www.reddit.com{item_data.get('permalink', '')}"
all_items.append(
{
"type": "post",
"title": item_data.get("title", ""),
"subreddit": item_data.get("subreddit", ""),
"url": post_url,
"created_utc": item_data.get("created_utc", ""),
}
)
elif kind == "t1":
comment_url = (
f"https://www.reddit.com{item_data.get('permalink', '')}"
)
all_items.append(
{
"type": "comment",
"subreddit": item_data.get("subreddit", ""),
"body": item_data.get("body", ""),
"created_utc": item_data.get("created_utc", ""),
"url": comment_url,
}
)
count += 1
if count >= limit:
break
params["after"] = data["data"].get("after")
if not params["after"]:
break
time.sleep(random.uniform(1, 2))
logging.info("Sleeping for random time")
logging.info("Successfully scraped user data for %s", username)
return all_items
def fetch_subreddit_posts(
self, subreddit, limit=10, category="hot", time_filter="all"
):
logging.info(
"Fetching subreddit/user posts for %s, limit: %d, category: %s, time_filter: %s",
subreddit,
limit,
category,
time_filter,
)
if category not in ["hot", "top", "new", "userhot", "usertop", "usernew"]:
raise ValueError("Category for Subredit must be either 'hot', 'top', or 'new' or for User must be 'userhot', 'usertop', or 'usernew'")
batch_size = min(100, limit)
total_fetched = 0
after = None
all_posts = []
while total_fetched < limit:
if category == "hot":
url = f"https://www.reddit.com/r/{subreddit}/hot.json"
elif category == "top":
url = f"https://www.reddit.com/r/{subreddit}/top.json"
elif category == "new":
url = f"https://www.reddit.com/r/{subreddit}/new.json"
elif category == "userhot":
url = f"https://www.reddit.com/user/{subreddit}/submitted/hot.json"
elif category == "usertop":
url = f"https://www.reddit.com/user/{subreddit}/submitted/top.json"
else:
url = f"https://www.reddit.com/user/{subreddit}/submitted/new.json"
params = {
"limit": batch_size,
"after": after,
"raw_json": 1,
"t": time_filter,
}
try:
response = self.session.get(url, params=params, timeout=self.timeout)
response.raise_for_status()
logging.info("Subreddit/user posts request successful")
except Exception as e:
logging.info("Subreddit/user posts request unsuccessful: %s", e)
if response.status_code != 200:
print(
f"Failed to fetch posts for subreddit/user {subreddit}: {response.status_code}"
)
break
data = response.json()
posts = data["data"]["children"]
if not posts:
break
for post in posts:
post_data = post["data"]
post_info = {
"title": post_data["title"],
"author": post_data["author"],
"permalink": post_data["permalink"],
"score": post_data["score"],
"num_comments": post_data["num_comments"],
"created_utc": post_data["created_utc"],
}
if "selftext" in post_data:
body = post_data["selftext"]
if body:
post_info["body"] = body
if "media_metadata" in post_data:
media_urls = []
for image in post_data["media_metadata"]:
if "m" not in post_data["media_metadata"][image]:
continue
content_type = post_data["media_metadata"][image]["m"]
extension = content_type[content_type.find('/')+1:]
media_urls.append("https://i.redd.it/{}.{}".format(image, extension))
post_info["media_urls"] = media_urls
elif "media" in post_data and post_data["media"] is not None and "reddit_video" in post_data["media"]:
media_url = post_data["media"]["reddit_video"]["fallback_url"]
video_url = media_url[:media_url.find('?')]
audio_url = video_url[:video_url.rfind('/')] + "/CMAF_AUDIO_128.mp4"
post_info["media_urls"] = [video_url, audio_url]
elif "url" in post_data:
url = post_data["url"]
if re.fullmatch(r"https:\/\/i\.redd\.it\/.{1,20}", url):
post_info["media_urls"] = [url]
elif "body" not in post_info:
post_info["body"] = url
if "thumbnail" in post_data and post_data["thumbnail"] != "self":
post_info["thumbnail_url"] = post_data["thumbnail"]
all_posts.append(post_info)
total_fetched += 1
if total_fetched >= limit:
break
after = data["data"].get("after")
if not after:
break
time.sleep(random.uniform(1, 2))
logging.info("Sleeping for random time")
logging.info("Successfully fetched subreddit posts for %s", subreddit)
return all_posts

87
app/app.py Executable file

@ -0,0 +1,87 @@
from flask import Flask, send_file, render_template, request
from urllib.parse import urlparse
import config
import json
import sqlite3
import subprocess
app = Flask(__name__)
@app.route('/file/<path:filename>')
def serve_file(filename):
try:
return send_file('{0}/{1}'.format(config.media_dir, filename), as_attachment=False)
except FileNotFoundError:
return "File not found", 404
@app.route('/hide/<path:permalink>')
def hide_post(permalink):
if permalink[0] != "/":
permalink = "/" + permalink
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
update = "UPDATE post SET hidden = ? WHERE permalink = ?"
binds = [True, permalink]
print(update)
print(binds)
cursor.execute(update,binds)
connection.commit()
connection.close()
return ""
@app.route('/', methods=['GET', 'POST'])
def index():
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
select = "SELECT subreddit, count(*) count FROM post WHERE hidden = ? GROUP BY subreddit ORDER BY count desc LIMIT 1"
binds = [False]
row = cursor.execute(select, binds).fetchone()
connection.close()
return get_subreddit(row[0])
@app.route('/r/<path:subreddit>')
def get_subreddit(subreddit):
title = f"/r/{subreddit}"
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
select = "SELECT subreddit, count FROM (SELECT subreddit, count(*) count FROM post WHERE hidden = ? GROUP BY subreddit ORDER BY count desc) t WHERE count > 0"
binds = [False]
results = cursor.execute(select, binds).fetchall()
subreddits = [f"/r/{sub[0]}" for sub in results]
count = results[subreddits.index(title)][1]
select = "SELECT post FROM post WHERE subreddit = ? and hidden = ? ORDER BY score desc LIMIT ?"
binds = [subreddit, False, config.posts_per_page_load]
results = cursor.execute(select, binds).fetchall()
posts = [json.loads(post[0]) for post in results]
add_media_html_to_posts(posts)
connection.close()
return render_template('index.html', title=title, count=count, posts=posts, subreddits=subreddits)
def add_media_html_to_posts(posts):
for post_index, post in enumerate(posts):
media_html = []
for media_index, media in enumerate(post["media_urls"]):
filename = urlparse(media).path
if filename[0]=='/':
filename = filename[1:]
html = get_media_html(filename, True if (post_index < 3 and media_index == 0) else False)
media_html.append(html)
post["media_html"] = media_html
def get_media_html(file, priority=False):
if file.endswith(('.jpg', '.jpeg', '.png', '.gif')):
return '<img class="invertible" src="/file/{0}" {1}>'.format(file, 'fetchpriority="high" loading="eager"' if priority else '')
if file.find("_AUDIO_")>0:
return '<audio src="/file/{0}" hidden></audio>'.format(file)
if file.endswith('.mp4'):
return '<video src="/file/{0}" type="video/mp4" onplay="playAudio(this)" onpause="pauseAudio(this)" onseeked="seekAudio(this)" controls></video>'.format(file)
if file.endswith('.webm'):
return '<video src="/file/{0}" type="video/webm" controls></video>'.format(file)
return file
if __name__ == '__main__':
subprocess.run(["python3", "make_db.py"])
#subprocess.run(["python3", "scrape_posts.py"])
app.run(host='0.0.0.0', port=8000)

32
app/config.py Normal file

@ -0,0 +1,32 @@
# Scheduler configuration
max_posts_per_pull = 100
pull_by = "day"
subreddits = [
# name, minimum upvotes
("pcgaming", 50),
("gadgets", 10),
("Nightreign", 100),
("CuratedTumblr", 100),
("196", 100),
("PoliticalCompassMemes", 100),
("meirl", 100),
("me_irl", 100),
("AITAH", 100),
("Fauxmoi", 100),
("NoFilterNews", 100),
("linux", 100),
("linux4noobs", 100),
("selfhosted", 100),
("HomeServer", 100),
("homelab", 100)
]
max_age_days = 30
max_age_seconds = max_age_days * 24 * 60 * 60
# Webpage configuration
posts_per_page_load = 50
db_dir = "/reddit/db"
media_dir = "/reddit/media"
db_file = f"{db_dir}/data.db"
yars_dir = "/app/YARS"

39
app/delete_posts.py Normal file

@ -0,0 +1,39 @@
import config
import os
import time
import sqlite3
import subprocess
if __name__ == "__main__":
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
now = int(time.time())
max_created_utc = now - config.max_age_seconds
select = "SELECT count(*) FROM post WHERE created_utc < ?"
binds = [max_created_utc]
results = cursor.execute(select, binds)
print("Deleting old posts")
delete = "DELETE FROM post WHERE created_utc < ?"
cursor.execute(delete, binds)
print("Deleting old media db rows")
delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
cursor.execute(delete)
all_files_local = subprocess.run(["find", "/reddit/media", "-type", "f"], capture_output=True, text=True)
all_files_local = set(all_files_local.stdout.splitlines())
select = "SELECT local from media"
results = cursor.execute(select).fetchall()
connection.commit()
connection.close()
all_files_db = set([row[0] for row in results])
extra_files = all_files_local - all_files_db
print("Deleting old files")
for file in extra_files:
print(f"Removing {file}")
os.remove(file)
empty_dirs = subprocess.run(["find", "/reddit/media", "-type", "d", "-empty"], capture_output=True, text=True)
empty_dirs = set(empty_dirs.stdout.splitlines())
print("Deleting empty directories")
for dir in empty_dirs:
print(f"Removind dir {dir}")
os.rmdir(dir)
print("Done")

12
app/make_db.py Executable file

@ -0,0 +1,12 @@
import config
import datetime
import os
import sqlite3
os.makedirs(config.db_dir, exist_ok=True)
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS post(permalink primary key, subreddit, created_utc, score, media_fetched, post, hidden)")
cursor.execute("CREATE TABLE IF NOT EXISTS media(permalink, url , local, PRIMARY KEY (permalink, url))")
connection.commit()
connection.close()

3
app/requirements.txt Executable file

@ -0,0 +1,3 @@
flask
requests
pygments

118
app/scrape_posts.py Executable file

@ -0,0 +1,118 @@
import config
import datetime
import json
import os
import re
import sqlite3
import sys
current_dir = os.path.dirname(os.path.abspath(__file__))
project_root = config.yars_dir
src_path = os.path.join(project_root, "src")
sys.path.append(src_path)
from yars.yars import YARS
from yars.utils import download_image
# Initialize the YARS Reddit miner
miner = YARS()
# Scrape posts for a subreddit and return them as a list of dicts
def scrape_subreddit_data(subreddit, limit=5):
ret = []
subreddit_name = subreddit[0]
minimum_score = subreddit[1]
print(f"Starting {subreddit_name}")
empty = dict()
try:
subreddit_posts = miner.fetch_subreddit_posts(
subreddit_name, limit=limit, category="top", time_filter=config.pull_by
)
for i, post in enumerate(subreddit_posts, 1):
score = post.get("score", 0)
if score < minimum_score:
continue
post_data = {
"permalink": post.get("permalink"),
"title": post.get("title", ""),
"author": post.get("author", ""),
"created_utc": post.get("created_utc", ""),
"num_comments": post.get("num_comments", 0),
"score": post.get("score", 0),
"media_urls" : post.get("media_urls", []),
"body": post.get("body", None),
}
ret.append(post_data)
print(f"Finished {subreddit_name}")
return ret
except Exception as e:
print(f"Error occurred while scraping subreddit: {e}")
return ret
def save_posts_to_db(data, cursor):
if len(data)==0:
return
upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden) VALUES "
upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
binds = []
for post in data:
binds.append(post["permalink"])
m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
binds.append(m.group(1)) #subreddit
binds.append(post["created_utc"])
binds.append(post["score"])
binds.append(False)
binds.append(json.dumps(post))
binds.append(False)
cursor.execute(upsert, binds)
def download_media(cursor):
select = "SELECT post FROM post WHERE media_fetched = ?"
binds = [False]
results = cursor.execute(select, binds)
post = results.fetchone()
binds = []
while post is not None:
post = json.loads(post[0])
if len(post["media_urls"])>0:
for url in post["media_urls"]:
binds.append(post["permalink"])
binds.append(url)
binds.append(download_image(url, config.media_dir))
print("image downloaded")
post = results.fetchone()
if len(binds)>0:
upsert = "INSERT INTO media(permalink, url, local) VALUES "
upsert += ",".join(["(?,?,?)"] * (len(binds)//3))
upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
cursor.execute(upsert, binds)
update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
binds = [True, False]
cursor.execute(update, binds)
def download_comments_for_permalink(permalink, cursor):
# Currently unused
post_details = miner.scrape_post_details(permalink)
update = "UPDATE post SET body = ? WHERE permalink = ?"
binds = [post_details["body"], permalink]
cursor.execute(update, binds)
upsert += "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
binds = [permalink, post_details["comments"]]
cursor.execute(upsert, binds)
# Main execution
if __name__ == "__main__":
os.makedirs(config.media_dir, exist_ok=True)
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
for subreddit in config.subreddits:
post_data = scrape_subreddit_data(subreddit, config.max_posts_per_pull)
save_posts_to_db(post_data, cursor)
connection.commit()
download_media(cursor)
connection.commit()
connection.close()

342
app/templates/index.html Executable file

@ -0,0 +1,342 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Reddit, but better</title>
<style>
:root {
--dark: #2c2c2c;
--darker: #171717;
--light: #bfbfbf;
}
html {
height: 100%;
}
body {
background-color: var(--darker);
color: var(--light);
min-height: 100%;
margin: 0; /* Removes default browser margin */
}
img, video {
max-width: 100%;
max-height: 100vh;
width: auto;
height: auto;
}
div.post {
background-color: var(--dark);
border: 2px solid var(--light);
border-radius: 15px;
padding: 10px;
margin-bottom: 20px;
}
.sidebar {
outline: 2px solid var(--light);
position: sticky;
top: 0;
left: 0;
background-color: var(--dark);
display: flex;
flex-wrap: nowrap;
z-index: 1000;
}
.content {
display: flex;
flex-grow: 1; /* Takes up remaining space */
flex-direction: column;
}
/* desktop */
@media (min-aspect-ratio: 1) {
img, video {
max-height: 80vh;
}
div.post {
width: 70vw;
margin-left: 20px;
}
.container {
display: flex;
flex-direction: row;
}
.sidebar {
width: fit-content;
height: 100vh;
flex-direction: column;
overflow-y: auto;
padding: 5px
}
}
/* phone */
@media (max-aspect-ratio: 1) {
img, video {
max-height: 100vh;
}
div.post {
width: calc(100vw - 50px);
margin-top: 10px;
}
.container {
display: flex;
flex-direction: column;
}
.sidebar {
width: 100vw;
height: 50px;
flex-direction: row;
overflow-x: auto;
align-items: center;
padding-top: 5px;
padding-bottom: 5px;
}
.content {
align-items: center;
}
}
.sidebar a {
display: block;
color: var(--light);
text-decoration: none;
white-space: nowrap;
margin: 5px;
padding: 5px;
}
.sidebar a:hover {
background-color: var(--darker);
color: var(--light);
}
.content h1 {
margin-left: 20px;
}
.invert {
filter: invert(1);
transition: filter 0.3s;
}
.button-wrapper {
display: flex;
width: 100%;
gap: 10px;
margin-top: 10px;
}
.button-wrapper.gallery {
gap: 5px;
}
.button-wrapper button {
flex: 1;
padding: 10px;
cursor: pointer;
background-color: var(--darker);
color: var(--light);
border: 2px solid var(--light);
border-radius: 10px;
font-size: 1.25rem;
font-weight: bold;
}
.button-wrapper button.gallery {
padding: 5px;
background-color: var(--darker);
border-radius: 5px;
border: none;
cursor: none;
}
.button-wrapper button.gallery.selected {
background-color: var(--light);
}
.text-content {
overflow: hidden;
transition: max-height 0.3s ease-out; /* Smooth transition */
max-height: 20vh;
position: relative;
}
.text-content::after {
content: "";
position: absolute;
bottom: 0;
left: 0;
width: 100%;
height: 30px;
background: linear-gradient(to bottom, rgba(255,255,255,0), var(--dark));
}
.text-content.expanded {
max-height: 1000vh;
}
.text-content.expanded::after {
display: none;
}
</style>
</head>
<body>
<div class="container">
<div class="sidebar">
{% for subreddit in subreddits %}
<a href="{{ subreddit }}">{{ subreddit }}</a>
{% endfor %}
</div>
<div class="content">
<h1>{{ title }} ({{ count }})</h1>
{% for post in posts %}
<div class="post">
<h2>{{ post.title }}</h2>
<h4>Score: {{ post.score }}</h4>
<div class="media-div">
{% for media in post.media_html %}
{{ media|safe }}
{% endfor %}
{% if post.media_html|length > 1 %}
<span class="button-wrapper gallery">
</span>
{% endif %}
</div>
{% if post.body %}
<div class="text-content" onclick="expand(this)">
{{ post.body }}
</div>
{% endif %}
<span class="button-wrapper">
<button type="button" onclick='comments("{{ post.permalink }}")'>Comments</button>
<button type="button" onclick='hide(this, "{{ post.permalink }}")'>Hide</button>
</span>
</div>
{% endfor %}
</div>
</div>
<script>
// setup galleries
mediaDivs = document.querySelectorAll('.media-div');
mediaDivs.forEach(div => {
images = Array.from(div.querySelectorAll('img'));
if (images.length > 1) {
buttonSpan = div.querySelector('.button-wrapper:first-of-type');
images.forEach(image => {
image.addEventListener('click', (e) => {
if (e.offsetX > image.offsetWidth * 2 / 3) {
// scroll right
div = e.target.closest('.media-div');
images = Array.from(div.querySelectorAll('img'));
currentIndex = images.indexOf(e.target)
if (currentIndex < (images.length -1)) {
buttons = Array.from(div.querySelectorAll('button'));
images[currentIndex].style.display = "none";
images[currentIndex+1].style.display = "block";
buttons[currentIndex].classList.remove('selected');
buttons[currentIndex+1].classList.add('selected');
} else {
e.target.classList.toggle('invert');
}
} else if (e.offsetX < image.offsetWidth / 3) {
// scroll left
div = e.target.closest('.media-div');
images = Array.from(div.querySelectorAll('img'));
currentIndex = images.indexOf(e.target)
if (currentIndex > 0) {
buttons = Array.from(div.querySelectorAll('button'));
images[currentIndex].style.display = "none";
images[currentIndex-1].style.display = "block";
buttons[currentIndex].classList.remove('selected');
buttons[currentIndex-1].classList.add('selected');
} else {
e.target.classList.toggle('invert');
}
} else {
image.classList.toggle('invert');
}
});
});
firstImage = images.shift();
firstButton = document.createElement('button');
firstButton.classList.add('gallery');
firstButton.classList.add('selected');
buttonSpan.appendChild(firstButton);
images.forEach(image => {
image.style.display = "none";
button = document.createElement('button');
button.classList.add('gallery');
buttonSpan.appendChild(button);
});
} else {
images.forEach(image => {
image.addEventListener('click', () => {
image.classList.toggle('invert');
});
});
}
});
// main button code
function hide(button, permalink){
const div = button.closest('.post');
div.scrollTo({
top: 0,
behavior: 'smooth'
});
div.remove();
try {
fetch('/hide' + permalink);
} catch (error) {
console.error('Could not hide post', error);
}
}
function comments(permalink){
window.open("https://reddit.com" + permalink, '_blank');
}
// text expand code
function checkHeight(){
const divs = document.querySelectorAll('.text-content');
divs.forEach(div => {
height = div.offsetHeight;
style = window.getComputedStyle(div);
maxHeight = parseInt(style.maxHeight);
if (height < maxHeight) {
div.classList.add('expanded');
}
});
}
function expand(div) {
div.classList.add('expanded');
}
window.addEventListener('load', (event) => {
checkHeight()
});
window.addEventListener('resize', (event) => {
checkHeight()
});
// audio/video sync code
function findAudio(video){
const div = video.closest('.post');
return div.querySelector('audio:first-of-type');
}
function playAudio(video){
audio = findAudio(video);
if (audio) {
audio.play();
audio.currentTime = video.currentTime;
}
}
function pauseAudio(video){
audio = findAudio(video);
if (audio) {
audio.pause();
audio.currentTime = video.currentTime;
}
}
function seekAudio(video){
audio = findAudio(video);
if (audio) {
audio.currentTime = video.currentTime;
}
}
</script>
</body>
</html>

26
compose.yaml Executable file

@ -0,0 +1,26 @@
services:
  ofelia:
    image: mcuadros/ofelia:latest
    restart: unless-stopped
    depends_on:
      - web
    command: daemon --docker
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
  web:
    build:
      context: app
      target: builder
    restart: unless-stopped
    stop_signal: SIGINT
    ports:
      - '8001:8000'
    volumes:
      - ./db:/reddit/db
      - ./media:/reddit/media
    labels:
      ofelia.enabled: "true"
      ofelia.job-exec.scrape.schedule: "@every 60m"
      ofelia.job-exec.scrape.command: "python3 /app/scrape_posts.py"
      ofelia.job-exec.clean.schedule: "@every 24h"
      ofelia.job-exec.clean.command: "python3 /app/delete_posts.py"