Initial Release
commit 6609e7253b
@ -0,0 +1,5 @@
media/
db/
**/YARS.log
**/__pycache__/
**/.venv/
@ -0,0 +1,28 @@

## Places to configure

### ./compose.yaml

You can change the host port, the host volume directories, how often Reddit is scanned for new data, and how often old data is removed.
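For reference, these are the relevant pieces of the `web` service in `compose.yaml` as shipped in this commit (comments added here for orientation):

```
ports:
  - '8001:8000'             # host:container — change 8001 to serve the UI on a different host port
volumes:
  - ./db:/reddit/db         # host directory holding the SQLite DB
  - ./media:/reddit/media   # host directory holding downloaded media
```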
### ./app/config.py

You can change how much data is pulled, from where, the minimum score a post needs to be saved to your DB, and how long it is retained.
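The knobs in `app/config.py` look like this (abridged excerpt of the file later in this commit; each entry in `subreddits` pairs a subreddit name with the minimum score a post needs before it is kept):

```
max_posts_per_pull = 100   # posts requested per subreddit on each scheduled run
pull_by = "day"            # time window used for Reddit's "top" listing
subreddits = [
    # name, minimum upvotes
    ("pcgaming", 50),
    ("gadgets", 10),
    # ... more entries elided ...
]
max_age_days = 30          # posts older than this are purged by delete_posts.py
```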
## Startup

```
docker compose build
docker compose up
```

The DB is created automatically. You will want to run

```
docker exec -it reddit-web-1 sh -c "python3 /app/scrape_posts.py"
```

to populate the DB with initial data; otherwise the web page will not be usable until the scheduled scrape task runs.
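The scheduled task mentioned above is the ofelia job attached to the `web` service in `compose.yaml`; its schedule (and the cleanup job's) can be changed there:

```
labels:
  ofelia.enabled: "true"
  ofelia.job-exec.scrape.schedule: "@every 60m"    # how often new posts are scraped
  ofelia.job-exec.scrape.command: "python3 /app/scrape_posts.py"
  ofelia.job-exec.clean.schedule: "@every 24h"     # how often old posts and media are purged
  ofelia.job-exec.clean.command: "python3 /app/delete_posts.py"
```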
## Thanks

This wouldn't be possible without https://github.com/datavorous/yars, which has been yoinked and tweaked to handle some more complex media cases. I will port those changes back to the project when time permits.
@ -0,0 +1,27 @@
# syntax=docker/dockerfile:1.4

FROM --platform=$BUILDPLATFORM python:3.10-alpine AS builder

WORKDIR /app

COPY requirements.txt /app
RUN --mount=type=cache,target=/root/.cache/pip \
    pip3 install -r requirements.txt

COPY . /app

ENTRYPOINT ["python3"]
CMD ["app.py"]

FROM builder AS dev-envs

RUN <<EOF
apk update
apk add git
EOF

RUN <<EOF
addgroup -S docker
adduser -S --shell /bin/bash --ingroup docker vscode
EOF

# install Docker tools (cli, buildx, compose)
COPY --from=gloursdocker/docker / /
@ -0,0 +1,162 @@
|
|||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
|
||||
.pdm.toml
|
||||
.pdm-python
|
||||
.pdm-build/
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
|
@ -0,0 +1,41 @@
ci:
  autofix_commit_msg: "[pre-commit.ci] auto fixes from pre-commit.com hooks"
  autofix_prs: true
  autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
  autoupdate_schedule: quarterly
  submodules: false
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
    hooks:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-ast
      - id: check-yaml
      - id: check-toml
      - id: check-merge-conflict
      - id: mixed-line-ending
      - id: check-case-conflict
      - id: sort-simple-yaml
        files: .pre-commit-config.yaml
  - repo: https://github.com/hadialqattan/pycln
    rev: v2.4.0
    hooks:
      - id: pycln
        args: [--config=pyproject.toml, src]
        types: [file]
        types_or: [python, pyi]
  - repo: https://github.com/psf/black-pre-commit-mirror
    rev: 24.8.0
    hooks:
      - id: black
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.9
    hooks:
      - id: ruff
        types: [file]
        types_or: [python, pyi, toml]
  - repo: https://github.com/codespell-project/codespell
    rev: v2.3.0
    hooks:
      - id: codespell
@ -0,0 +1,21 @@
MIT License

Copyright (c) 2024 Sagnik Bhattacharjee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@ -0,0 +1,172 @@
<div align="center">

<img src="logo.svg" width="10%">

# YARS (Yet Another Reddit Scraper)

[](https://github.com/datavorous/yars/stargazers)<br>

</div>

YARS is a Python package that simplifies scraping Reddit for posts, comments, user data, and media, and it ships with a few utility functions. It is built in **Python** and relies on the **requests** module to fetch data from Reddit's public API. The scraper uses simple `.json` requests, avoiding the need for official Reddit API keys, which keeps it lightweight and easy to use.
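Under the hood this just means appending `.json` to ordinary Reddit URLs. A minimal sketch of the kind of request YARS wraps (plain `requests`, not the YARS API; the subreddit and user agent string are only examples):

```python
import requests

# Reddit serves a JSON view of any listing when ".json" is appended to the URL.
url = "https://www.reddit.com/r/python/top.json"
params = {"limit": 5, "t": "week"}           # top 5 posts from the past week
headers = {"User-Agent": "yars-demo/0.1"}    # send some identifiable user agent

response = requests.get(url, params=params, headers=headers, timeout=10)
response.raise_for_status()

for child in response.json()["data"]["children"]:
    print(child["data"]["title"])
```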
## Features

- **Reddit Search**: Search Reddit for posts using a keyword query.
- **Post Scraping**: Scrape post details, including title, body, and comments.
- **User Data Scraping**: Fetch the recent activity (posts and comments) of a Reddit user.
- **Subreddit Posts Fetching**: Retrieve posts from specific subreddits, with flexible options for category and time filters.
- **Image Downloading**: Download images from posts.
- **Results Display**: Use `Pygments` for a colorful display of JSON-formatted results.

> [!WARNING]
> Use rotating proxies, or Reddit may gift you an IP ban.
> I could extract at most 2552 posts in one go from 'r/all' using this.
> [Here](https://files.catbox.moe/zdra2i.json) is a **7.1 MB JSON** file containing the top 100 posts from 'r/nosleep', including post titles, body text, all comments and their replies, post scores, upload times, etc.
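YARS itself accepts a proxy and a timeout when you construct it, so routing requests through a rotating proxy is a one-liner; a sketch, with a placeholder proxy URL:

```python
from yars import YARS

# The proxy URL is a placeholder — substitute your own rotating proxy endpoint.
miner = YARS(proxy="http://user:pass@proxy.example:8080", timeout=10)

posts = miner.fetch_subreddit_posts("nosleep", limit=5, category="top", time_filter="week")
for post in posts:
    print(post["title"], post["score"])
```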
## Dependencies

- `requests`
- `Pygments`

## Installation

1. Clone the repository:

   ```
   git clone https://github.com/datavorous/YARS.git
   ```

   Navigate into the `src` folder.

2. Install `uv` (if not already installed):

   ```
   pip install uv
   ```

3. Run the application:

   ```
   uv run example/example.py
   ```

   This will set up the virtual environment, install the necessary packages, and run the `example.py` program.
## Usage

We will use the following Python script to demonstrate the functionality of the scraper. The script covers:

- Searching Reddit
- Scraping post details
- Fetching user data
- Retrieving subreddit posts
- Downloading images from posts

#### Code Overview

```python
from yars import YARS
from utils import display_results, download_image

miner = YARS()
```

#### Step 1: Searching Reddit

The `search_reddit` method lets you search Reddit with a query string. Here, we search for posts containing "OpenAI" and limit the results to 3 posts. The `display_results` function presents the results in a formatted way.

```python
search_results = miner.search_reddit("OpenAI", limit=3)
display_results(search_results, "SEARCH")
```

#### Step 2: Scraping Post Details

Next, we scrape the details of a specific Reddit post by passing its permalink. If the post details are successfully retrieved, they are displayed using `display_results`; otherwise, an error message is printed.

```python
permalink = "https://www.reddit.com/r/getdisciplined/comments/1frb5ib/what_single_health_test_or_practice_has/".split('reddit.com')[1]
post_details = miner.scrape_post_details(permalink)
if post_details:
    display_results(post_details, "POST DATA")
else:
    print("Failed to scrape post details.")
```

#### Step 3: Fetching User Data

We can also retrieve a Reddit user's recent activity (posts and comments) using the `scrape_user_data` method. Here, we fetch data for the user `iamsecb` and limit the results to 2 items.

```python
user_data = miner.scrape_user_data("iamsecb", limit=2)
display_results(user_data, "USER DATA")
```

#### Step 4: Fetching Subreddit Posts

The `fetch_subreddit_posts` method retrieves posts from a specified subreddit. In this example, we fetch 11 top posts from the "generative" subreddit from the past week.

```python
subreddit_posts = miner.fetch_subreddit_posts("generative", limit=11, category="top", time_filter="week")
display_results(subreddit_posts, "generative SUBREDDIT Top Posts")
```

#### Step 5: Downloading Images

For the posts retrieved from the subreddit, we try to download their associated images with the `download_image` function. If a post doesn't have an `image_url`, its thumbnail URL is used as a fallback.

```python
for z in range(3):
    try:
        image_url = subreddit_posts[z]["image_url"]
    except KeyError:
        image_url = subreddit_posts[z]["thumbnail_url"]
    download_image(image_url)
```

### Complete Code Example

```python
from yars import YARS
from utils import display_results, download_image

miner = YARS()

# Search for posts related to "OpenAI"
search_results = miner.search_reddit("OpenAI", limit=3)
display_results(search_results, "SEARCH")

# Scrape post details using its permalink
permalink = "https://www.reddit.com/r/getdisciplined/comments/1frb5ib/what_single_health_test_or_practice_has/".split('reddit.com')[1]
post_details = miner.scrape_post_details(permalink)
if post_details:
    display_results(post_details, "POST DATA")
else:
    print("Failed to scrape post details.")

# Fetch recent activity of user "iamsecb"
user_data = miner.scrape_user_data("iamsecb", limit=2)
display_results(user_data, "USER DATA")

# Fetch top posts from the subreddit "generative" from the past week
subreddit_posts = miner.fetch_subreddit_posts("generative", limit=11, category="top", time_filter="week")
display_results(subreddit_posts, "generative SUBREDDIT Top Posts")

# Download images from the fetched posts
for z in range(3):
    try:
        image_url = subreddit_posts[z]["image_url"]
    except KeyError:
        image_url = subreddit_posts[z]["thumbnail_url"]
    download_image(image_url)
```

You can now use these techniques to explore and scrape data from Reddit programmatically.

## Contributing

Contributions are welcome! For feature requests, bug reports, or questions, please open an issue. If you would like to contribute code, please open a pull request with your changes.

### Our Notable Contributors

<a href="https://github.com/datavorous/yars/graphs/contributors">
  <img src="https://contrib.rocks/image?repo=datavorous/yars" />
</a>
@ -0,0 +1,4 @@
<svg width="217" height="229" viewBox="0 0 217 229" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M206.56 195.464C189.638 211.477 168.078 222.382 144.606 226.8C121.134 231.218 96.8052 228.95 74.6953 220.284C52.5854 211.618 33.6878 196.942 20.3922 178.113C7.09653 159.283 0 137.146 0 114.5C0 91.854 7.09653 69.7166 20.3922 50.8872C33.6878 32.0578 52.5855 17.382 74.6953 8.71579C96.8052 0.0495596 121.134 -2.21792 144.606 2.2001C168.078 6.61811 189.638 17.5232 206.56 33.5363L179.384 59.2528C167.836 48.3259 153.124 40.8846 137.108 37.8699C121.092 34.8552 104.49 36.4025 89.4031 42.316C74.316 48.2296 61.4208 58.2439 52.3483 71.0925C43.2757 83.9412 38.4332 99.0471 38.4332 114.5C38.4332 129.953 43.2757 145.059 52.3483 157.907C61.4208 170.756 74.316 180.77 89.4031 186.684C104.49 192.598 121.092 194.145 137.108 191.13C153.124 188.115 167.836 180.674 179.384 169.747L206.56 195.464Z" fill="#FB471A"/>
<path d="M52.2642 51.0675C65.76 38.6206 82.9546 30.1442 101.674 26.7101C120.393 23.276 139.796 25.0385 157.429 31.7747C175.062 38.5109 190.133 49.9183 200.737 64.5543C211.34 79.1902 217 96.3975 217 114C217 131.603 211.34 148.81 200.737 163.446C190.133 178.082 175.062 189.489 157.429 196.225C139.796 202.961 120.393 204.724 101.674 201.29C82.9546 197.856 65.7599 189.379 52.2642 176.932L80.2341 151.136C88.1979 158.481 98.3445 163.483 109.391 165.51C120.437 167.536 131.886 166.496 142.292 162.521C152.697 158.546 161.591 151.815 167.848 143.178C174.105 134.541 177.445 124.387 177.445 114C177.445 103.613 174.105 93.4588 167.848 84.8221C161.591 76.1854 152.697 69.454 142.292 65.4789C131.886 61.5039 120.437 60.4639 109.391 62.4903C98.3445 64.5168 88.1979 69.5187 80.2341 76.8636L52.2642 51.0675Z" fill="#FFBA4A"/>
</svg>
@ -0,0 +1,12 @@
[project]
name = "sm"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.13"
dependencies = [
    "flask>=3.0.3",
    "meta-ai-api>=1.2.1",
    "pygments>=2.18.0",
    "requests>=2.32.3",
]

File diff suppressed because it is too large
@ -0,0 +1,15 @@
from requests import Session

from .agents import get_agent


class RandomUserAgentSession(Session):
    """
    Session class (inherited from requests.Session) which passes
    a random user agent with each request.
    """

    def request(self, *args, **kwargs):
        self.headers.update({"User-Agent": get_agent()})

        return super().request(*args, **kwargs)
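A quick sketch of how this session is intended to be used (assuming the `yars` package layout referenced elsewhere in this commit, and an example URL): it behaves like a normal `requests.Session`, but every outgoing request carries a freshly chosen user agent from `agents.py`.

```python
from yars.sessions import RandomUserAgentSession

session = RandomUserAgentSession()

# Each call picks a new User-Agent header just before the request is sent.
for _ in range(3):
    response = session.get("https://www.reddit.com/r/python/about.json", timeout=10)
    print(response.request.headers["User-Agent"], response.status_code)
```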
@ -0,0 +1,96 @@
import os
import csv
import json
import logging
import requests
from urllib.parse import urlparse
from pygments import formatters, highlight, lexers

logging.basicConfig(
    level=logging.INFO, filename="YARS.log", format="%(asctime)s - %(message)s"
)


def display_results(results, title):
    try:
        print(f"\n{'-'*20} {title} {'-'*20}")

        if isinstance(results, list):
            for item in results:
                if isinstance(item, dict):
                    formatted_json = json.dumps(item, sort_keys=True, indent=4)
                    colorful_json = highlight(
                        formatted_json,
                        lexers.JsonLexer(),
                        formatters.TerminalFormatter(),
                    )
                    print(colorful_json)
                else:
                    print(item)
        elif isinstance(results, dict):
            formatted_json = json.dumps(results, sort_keys=True, indent=4)
            colorful_json = highlight(
                formatted_json, lexers.JsonLexer(), formatters.TerminalFormatter()
            )
            print(colorful_json)
        else:
            logging.warning(
                "No results to display: expected a list or dictionary, got %s",
                type(results),
            )
            print("No results to display.")

    except Exception as e:
        logging.error("Error displaying results: %s", e)
        print("Error displaying results.")


def download_image(image_url, output_folder="images", session=None):
    # Mirror the URL path inside the output folder so nested media paths are preserved.
    os.makedirs(output_folder, exist_ok=True)

    filename = urlparse(image_url).path
    if filename.startswith("/"):
        filename = filename[1:]
    filepath = os.path.join(output_folder, filename)
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    if session is None:
        session = requests.Session()

    try:
        response = session.get(image_url, stream=True)
        response.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in response.iter_content(8192):
                f.write(chunk)
        logging.info("Downloaded: %s", filepath)
        return filepath
    except requests.RequestException as e:
        logging.error("Failed to download %s: %s", image_url, e)
        return None
    except Exception as e:
        logging.error("An error occurred while saving the image: %s", e)
        return None


def export_to_json(data, filename="output.json"):
    try:
        with open(filename, "w", encoding="utf-8") as json_file:
            json.dump(data, json_file, indent=4)
        print(f"Data successfully exported to {filename}")
    except Exception as e:
        print(f"Error exporting to JSON: {e}")


def export_to_csv(data, filename="output.csv"):
    try:
        keys = data[0].keys()
        with open(filename, "w", newline="", encoding="utf-8") as output_file:
            dict_writer = csv.DictWriter(output_file, fieldnames=keys)
            dict_writer.writeheader()
            dict_writer.writerows(data)
        print(f"Data successfully exported to {filename}")
    except Exception as e:
        print(f"Error exporting to CSV: {e}")
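The two export helpers are not demonstrated in the YARS README; a short usage sketch (assuming the same package layout, with the query as an example):

```python
from yars.yars import YARS
from yars.utils import export_to_csv, export_to_json

miner = YARS()
# Search results all share the same keys (title, link, description),
# which suits export_to_csv's use of the first row's keys as columns.
results = miner.search_reddit("OpenAI", limit=5)

export_to_json(results, "openai_posts.json")
export_to_csv(results, "openai_posts.csv")
```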
@ -0,0 +1,309 @@
|
|||
from __future__ import annotations
|
||||
from .sessions import RandomUserAgentSession
|
||||
import time
|
||||
import random
|
||||
import logging
|
||||
import re
|
||||
import requests
|
||||
from urllib3.util.retry import Retry
|
||||
from requests.adapters import HTTPAdapter
|
||||
|
||||
logger = logging.basicConfig(
|
||||
filename="YARS.log",
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
|
||||
|
||||
class YARS:
|
||||
__slots__ = ("headers", "session", "proxy", "timeout")
|
||||
|
||||
def __init__(self, proxy=None, timeout=10, random_user_agent=True):
|
||||
self.session = RandomUserAgentSession() if random_user_agent else requests.Session()
|
||||
self.proxy = proxy
|
||||
self.timeout = timeout
|
||||
|
||||
retries = Retry(
|
||||
total=5,
|
||||
backoff_factor=2, # Exponential backoff
|
||||
status_forcelist=[429, 500, 502, 503, 504],
|
||||
)
|
||||
|
||||
self.session.mount("https://", HTTPAdapter(max_retries=retries))
|
||||
|
||||
if proxy:
|
||||
self.session.proxies.update({"http": proxy, "https": proxy})
|
||||
def handle_search(self,url, params, after=None, before=None):
|
||||
if after:
|
||||
params["after"] = after
|
||||
if before:
|
||||
params["before"] = before
|
||||
|
||||
try:
|
||||
response = self.session.get(url, params=params, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
logging.info("Search request successful")
|
||||
except Exception as e:
|
||||
if response.status_code != 200:
|
||||
logging.info("Search request unsuccessful due to: %s", e)
|
||||
print(f"Failed to fetch search results: {response.status_code}")
|
||||
return []
|
||||
|
||||
data = response.json()
|
||||
results = []
|
||||
for post in data["data"]["children"]:
|
||||
post_data = post["data"]
|
||||
results.append(
|
||||
{
|
||||
"title": post_data["title"],
|
||||
"link": f"https://www.reddit.com{post_data['permalink']}",
|
||||
"description": post_data.get("selftext", "")[:269],
|
||||
}
|
||||
)
|
||||
logging.info("Search Results Retrned %d Results", len(results))
|
||||
return results
|
||||
def search_reddit(self, query, limit=10, after=None, before=None):
|
||||
url = "https://www.reddit.com/search.json"
|
||||
params = {"q": query, "limit": limit, "sort": "relevance", "type": "link"}
|
||||
return self.handle_search(url, params, after, before)
|
||||
def search_subreddit(self, subreddit, query, limit=10, after=None, before=None, sort="relevance"):
|
||||
url = f"https://www.reddit.com/r/{subreddit}/search.json"
|
||||
params = {"q": query, "limit": limit, "sort": "relevance", "type": "link","restrict_sr":"on"}
|
||||
return self.handle_search(url, params, after, before)
|
||||
|
||||
def scrape_post_details(self, permalink):
|
||||
url = f"https://www.reddit.com{permalink}.json"
|
||||
|
||||
try:
|
||||
response = self.session.get(url, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
logging.info("Post details request successful : %s", url)
|
||||
except Exception as e:
|
||||
logging.info("Post details request unsccessful: %e", e)
|
||||
if response.status_code != 200:
|
||||
print(f"Failed to fetch post data: {response.status_code}")
|
||||
return None
|
||||
|
||||
post_data = response.json()
|
||||
if not isinstance(post_data, list) or len(post_data) < 2:
|
||||
logging.info("Unexpected post data structre")
|
||||
print("Unexpected post data structure")
|
||||
return None
|
||||
|
||||
main_post = post_data[0]["data"]["children"][0]["data"]
|
||||
title = main_post["title"]
|
||||
body = main_post.get("selftext", "")
|
||||
|
||||
comments = self._extract_comments(post_data[1]["data"]["children"])
|
||||
logging.info("Successfully scraped post: %s", title)
|
||||
return {"title": title, "body": body, "comments": comments}
|
||||
|
||||
def _extract_comments(self, comments):
|
||||
logging.info("Extracting comments")
|
||||
extracted_comments = []
|
||||
for comment in comments:
|
||||
if isinstance(comment, dict) and comment.get("kind") == "t1":
|
||||
comment_data = comment.get("data", {})
|
||||
extracted_comment = {
|
||||
"author": comment_data.get("author", ""),
|
||||
"body": comment_data.get("body", ""),
|
||||
"score": comment_data.get("score",""),
|
||||
"replies": [],
|
||||
}
|
||||
|
||||
replies = comment_data.get("replies", "")
|
||||
if isinstance(replies, dict):
|
||||
extracted_comment["replies"] = self._extract_comments(
|
||||
replies.get("data", {}).get("children", [])
|
||||
)
|
||||
extracted_comments.append(extracted_comment)
|
||||
logging.info("Successfully extracted comments")
|
||||
return extracted_comments
|
||||
|
||||
def scrape_user_data(self, username, limit=10):
|
||||
logging.info("Scraping user data for %s, limit: %d", username, limit)
|
||||
base_url = f"https://www.reddit.com/user/{username}/.json"
|
||||
params = {"limit": limit, "after": None}
|
||||
all_items = []
|
||||
count = 0
|
||||
|
||||
while count < limit:
|
||||
try:
|
||||
response = self.session.get(
|
||||
base_url, params=params, timeout=self.timeout
|
||||
)
|
||||
response.raise_for_status()
|
||||
logging.info("User data request successful")
|
||||
except Exception as e:
|
||||
logging.info("User data request unsuccessful: %s", e)
|
||||
if response.status_code != 200:
|
||||
print(
|
||||
f"Failed to fetch data for user {username}: {response.status_code}"
|
||||
)
|
||||
break
|
||||
try:
|
||||
data = response.json()
|
||||
except ValueError:
|
||||
print(f"Failed to parse JSON response for user {username}.")
|
||||
break
|
||||
|
||||
if "data" not in data or "children" not in data["data"]:
|
||||
print(
|
||||
f"No 'data' or 'children' field found in response for user {username}."
|
||||
)
|
||||
logging.info("No 'data' or 'children' field found in response")
|
||||
break
|
||||
|
||||
items = data["data"]["children"]
|
||||
if not items:
|
||||
print(f"No more items found for user {username}.")
|
||||
logging.info("No more items found for user")
|
||||
break
|
||||
|
||||
for item in items:
|
||||
kind = item["kind"]
|
||||
item_data = item["data"]
|
||||
if kind == "t3":
|
||||
post_url = f"https://www.reddit.com{item_data.get('permalink', '')}"
|
||||
all_items.append(
|
||||
{
|
||||
"type": "post",
|
||||
"title": item_data.get("title", ""),
|
||||
"subreddit": item_data.get("subreddit", ""),
|
||||
"url": post_url,
|
||||
"created_utc": item_data.get("created_utc", ""),
|
||||
}
|
||||
)
|
||||
elif kind == "t1":
|
||||
comment_url = (
|
||||
f"https://www.reddit.com{item_data.get('permalink', '')}"
|
||||
)
|
||||
all_items.append(
|
||||
{
|
||||
"type": "comment",
|
||||
"subreddit": item_data.get("subreddit", ""),
|
||||
"body": item_data.get("body", ""),
|
||||
"created_utc": item_data.get("created_utc", ""),
|
||||
"url": comment_url,
|
||||
}
|
||||
)
|
||||
count += 1
|
||||
if count >= limit:
|
||||
break
|
||||
|
||||
params["after"] = data["data"].get("after")
|
||||
if not params["after"]:
|
||||
break
|
||||
|
||||
time.sleep(random.uniform(1, 2))
|
||||
logging.info("Sleeping for random time")
|
||||
|
||||
logging.info("Successfully scraped user data for %s", username)
|
||||
return all_items
|
||||
|
||||
def fetch_subreddit_posts(
|
||||
self, subreddit, limit=10, category="hot", time_filter="all"
|
||||
):
|
||||
logging.info(
|
||||
"Fetching subreddit/user posts for %s, limit: %d, category: %s, time_filter: %s",
|
||||
subreddit,
|
||||
limit,
|
||||
category,
|
||||
time_filter,
|
||||
)
|
||||
if category not in ["hot", "top", "new", "userhot", "usertop", "usernew"]:
|
||||
raise ValueError("Category for Subredit must be either 'hot', 'top', or 'new' or for User must be 'userhot', 'usertop', or 'usernew'")
|
||||
|
||||
batch_size = min(100, limit)
|
||||
total_fetched = 0
|
||||
after = None
|
||||
all_posts = []
|
||||
|
||||
while total_fetched < limit:
|
||||
if category == "hot":
|
||||
url = f"https://www.reddit.com/r/{subreddit}/hot.json"
|
||||
elif category == "top":
|
||||
url = f"https://www.reddit.com/r/{subreddit}/top.json"
|
||||
elif category == "new":
|
||||
url = f"https://www.reddit.com/r/{subreddit}/new.json"
|
||||
elif category == "userhot":
|
||||
url = f"https://www.reddit.com/user/{subreddit}/submitted/hot.json"
|
||||
elif category == "usertop":
|
||||
url = f"https://www.reddit.com/user/{subreddit}/submitted/top.json"
|
||||
else:
|
||||
url = f"https://www.reddit.com/user/{subreddit}/submitted/new.json"
|
||||
|
||||
params = {
|
||||
"limit": batch_size,
|
||||
"after": after,
|
||||
"raw_json": 1,
|
||||
"t": time_filter,
|
||||
}
|
||||
try:
|
||||
response = self.session.get(url, params=params, timeout=self.timeout)
|
||||
response.raise_for_status()
|
||||
logging.info("Subreddit/user posts request successful")
|
||||
except Exception as e:
|
||||
logging.info("Subreddit/user posts request unsuccessful: %s", e)
|
||||
if response.status_code != 200:
|
||||
print(
|
||||
f"Failed to fetch posts for subreddit/user {subreddit}: {response.status_code}"
|
||||
)
|
||||
break
|
||||
|
||||
data = response.json()
|
||||
posts = data["data"]["children"]
|
||||
if not posts:
|
||||
break
|
||||
|
||||
for post in posts:
|
||||
post_data = post["data"]
|
||||
post_info = {
|
||||
"title": post_data["title"],
|
||||
"author": post_data["author"],
|
||||
"permalink": post_data["permalink"],
|
||||
"score": post_data["score"],
|
||||
"num_comments": post_data["num_comments"],
|
||||
"created_utc": post_data["created_utc"],
|
||||
}
|
||||
if "selftext" in post_data:
|
||||
body = post_data["selftext"]
|
||||
if body is not None and len(body) > 0:
|
||||
post_info["body"] = body
|
||||
if "media_metadata" in post_data:
|
||||
media_urls = []
|
||||
for image in post_data["media_metadata"]:
|
||||
if "m" not in post_data["media_metadata"][image]:
|
||||
continue
|
||||
content_type = post_data["media_metadata"][image]["m"]
|
||||
extension = content_type[content_type.find('/')+1:]
|
||||
media_urls.append("https://i.redd.it/{}.{}".format(image, extension))
|
||||
post_info["media_urls"] = media_urls
|
||||
elif "media" in post_data and post_data["media"] is not None and "reddit_video" in post_data["media"]:
|
||||
media_url = post_data["media"]["reddit_video"]["fallback_url"]
|
||||
video_url = media_url[:media_url.find('?')]
|
||||
audio_url = video_url[:video_url.rfind('/')] + "/CMAF_AUDIO_128.mp4"
|
||||
post_info["media_urls"] = [video_url, audio_url]
|
||||
elif "url" in post_data:
|
||||
url = post_data["url"]
|
||||
if re.fullmatch(r"https:\/\/i\.redd\.it\/.{1,20}", url):
|
||||
post_info["media_urls"] = [url]
|
||||
elif "body" not in post_info:
|
||||
post_info["body"] = url
|
||||
if "thumbnail" in post_data and post_data["thumbnail"] != "self":
|
||||
post_info["thumbnail_url"] = post_data["thumbnail"]
|
||||
|
||||
all_posts.append(post_info)
|
||||
total_fetched += 1
|
||||
if total_fetched >= limit:
|
||||
break
|
||||
|
||||
after = data["data"].get("after")
|
||||
if not after:
|
||||
break
|
||||
|
||||
time.sleep(random.uniform(1, 2))
|
||||
logging.info("Sleeping for random time")
|
||||
|
||||
logging.info("Successfully fetched subreddit posts for %s", subreddit)
|
||||
return all_posts
|
||||
|
|
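The vendored scraper above also exposes `search_subreddit` and the user-oriented categories of `fetch_subreddit_posts` (`userhot`, `usertop`, `usernew`), neither of which the upstream README demonstrates; a small sketch, with a placeholder subreddit and the README's example username:

```python
from yars.yars import YARS

miner = YARS(timeout=10)

# Keyword search restricted to a single subreddit.
results = miner.search_subreddit("selfhosted", "reverse proxy", limit=5)
for r in results:
    print(r["title"], r["link"])

# A user's own submissions, newest first ("user"-prefixed categories hit /user/<name>/submitted/).
submissions = miner.fetch_subreddit_posts("iamsecb", limit=5, category="usernew")
for post in submissions:
    print(post["permalink"], post["score"])
```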
@ -0,0 +1,87 @@
|
|||
from flask import Flask, send_file, render_template, request
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import config
|
||||
import json
|
||||
import sqlite3
|
||||
import subprocess
|
||||
|
||||
app = Flask(__name__)
|
||||
|
||||
@app.route('/file/<path:filename>')
|
||||
def serve_file(filename):
|
||||
try:
|
||||
return send_file('{0}/{1}'.format(config.media_dir, filename), as_attachment=False)
|
||||
except FileNotFoundError:
|
||||
return "File not found", 404
|
||||
|
||||
@app.route('/hide/<path:permalink>')
|
||||
def hide_post(permalink):
|
||||
if permalink[0] != "/":
|
||||
permalink = "/" + permalink
|
||||
connection = sqlite3.connect(config.db_file)
|
||||
cursor = connection.cursor()
|
||||
update = "UPDATE post SET hidden = ? WHERE permalink = ?"
|
||||
binds = [True, permalink]
|
||||
print(update)
|
||||
print(binds)
|
||||
cursor.execute(update,binds)
|
||||
connection.commit()
|
||||
connection.close()
|
||||
return ""
|
||||
|
||||
@app.route('/', methods=['GET', 'POST'])
|
||||
def index():
|
||||
connection = sqlite3.connect(config.db_file)
|
||||
cursor = connection.cursor()
|
||||
select = "SELECT subreddit, count(*) count FROM post WHERE hidden = ? GROUP BY subreddit ORDER BY count desc LIMIT 1"
|
||||
binds = [False]
|
||||
row = cursor.execute(select, binds).fetchone()
|
||||
connection.close()
|
||||
return get_subreddit(row[0])
|
||||
|
||||
@app.route('/r/<path:subreddit>')
|
||||
def get_subreddit(subreddit):
|
||||
title = f"/r/{subreddit}"
|
||||
connection = sqlite3.connect(config.db_file)
|
||||
cursor = connection.cursor()
|
||||
select = "SELECT subreddit, count FROM (SELECT subreddit, count(*) count FROM post WHERE hidden = ? GROUP BY subreddit ORDER BY count desc) t WHERE count > 0"
|
||||
binds = [False]
|
||||
results = cursor.execute(select, binds).fetchall()
|
||||
subreddits = [f"/r/{sub[0]}" for sub in results]
|
||||
count = results[subreddits.index(title)][1]
|
||||
select = "SELECT post FROM post WHERE subreddit = ? and hidden = ? ORDER BY score desc LIMIT ?"
|
||||
binds = [subreddit, False, config.posts_per_page_load]
|
||||
results = cursor.execute(select, binds).fetchall()
|
||||
posts = [json.loads(post[0]) for post in results]
|
||||
add_media_html_to_posts(posts)
|
||||
connection.close()
|
||||
return render_template('index.html', title=title, count=count, posts=posts, subreddits=subreddits)
|
||||
|
||||
def add_media_html_to_posts(posts):
|
||||
for post_index, post in enumerate(posts):
|
||||
media_html = []
|
||||
for media_index, media in enumerate(post["media_urls"]):
|
||||
filename = urlparse(media).path
|
||||
if filename[0]=='/':
|
||||
filename = filename[1:]
|
||||
html = get_media_html(filename, True if (post_index < 3 and media_index == 0) else False)
|
||||
media_html.append(html)
|
||||
post["media_html"] = media_html
|
||||
|
||||
|
||||
def get_media_html(file, priority=False):
|
||||
if file.endswith('.jpg') or file.endswith('.jpeg') or file.endswith('.png') or file.endswith('.gif'):
|
||||
return '<img class="invertible" src="/file/{0}" {1}>'.format(file, 'fetchpriority="high" loading="eager"' if priority else '')
|
||||
if file.find("_AUDIO_")>0:
|
||||
return '<audio src="/file/{0}" hidden></audio>'.format(file)
|
||||
if file.endswith('.mp4'):
|
||||
return '<video src="/file/{0}" type="video/mp4" onplay="playAudio(this)" onpause="pauseAudio(this)" onseeked="seekAudio(this)" controls></video>'.format(file)
|
||||
if file.endswith('.webm'):
|
||||
return '<video src="/file/{0}" type="video/webm" controls></video>'.format(file)
|
||||
return file
|
||||
|
||||
if __name__ == '__main__':
|
||||
subprocess.run(["python3", "make_db.py"])
|
||||
#subprocess.run(["python3", "scrape_posts.py"])
|
||||
app.run(host='0.0.0.0', port=8000)
|
||||
|
|
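For reference, the Flask routes above can be exercised like this once the stack from `compose.yaml` is running (host port 8001 is the default mapping in this commit; the permalink below is a placeholder):

```python
import requests

base = "http://localhost:8001"  # host side of the '8001:8000' mapping in compose.yaml

# Front page: serves the subreddit with the most unhidden posts.
print(requests.get(base + "/").status_code)

# A specific subreddit view.
print(requests.get(base + "/r/pcgaming").status_code)

# Mark a post hidden by its Reddit permalink (this really flips the hidden flag in the DB).
requests.get(base + "/hide/r/pcgaming/comments/abc123/example_post/")
```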
@ -0,0 +1,32 @@
# Scheduler configuration
max_posts_per_pull = 100
pull_by = "day"
subreddits = [
    # name, minimum upvotes
    ("pcgaming", 50),
    ("gadgets", 10),
    ("Nightreign", 100),
    ("CuratedTumblr", 100),
    ("196", 100),
    ("PoliticalCompassMemes", 100),
    ("meirl", 100),
    ("me_irl", 100),
    ("AITAH", 100),
    ("Fauxmoi", 100),
    ("NoFilterNews", 100),
    ("linux", 100),
    ("linux4noobs", 100),
    ("selfhosted", 100),
    ("HomeServer", 100),
    ("homelab", 100)
]
max_age_days = 30
max_age_seconds = max_age_days * 24 * 60 * 60

# Webpage configuration
posts_per_page_load = 50

db_dir = "/reddit/db"
media_dir = "/reddit/media"
db_file = f"{db_dir}/data.db"
yars_dir = "/app/YARS"
@ -0,0 +1,39 @@
import config
import os
import time
import sqlite3
import subprocess

if __name__ == "__main__":
    connection = sqlite3.connect(config.db_file)
    cursor = connection.cursor()
    now = int(time.time())
    max_created_utc = now - config.max_age_seconds
    select = "SELECT count(*) FROM post WHERE created_utc < ?"
    binds = [max_created_utc]
    results = cursor.execute(select, binds)
    print("Deleting old posts")
    delete = "DELETE FROM post WHERE created_utc < ?"
    cursor.execute(delete, binds)
    print("Deleting old media db rows")
    delete = "DELETE FROM media WHERE permalink NOT IN (SELECT permalink FROM post)"
    cursor.execute(delete)
    all_files_local = subprocess.run(["find", "/reddit/media", "-type", "f"], capture_output=True, text=True)
    all_files_local = set(all_files_local.stdout.splitlines())
    select = "SELECT local from media"
    results = cursor.execute(select).fetchall()
    connection.commit()
    connection.close()
    all_files_db = set([row[0] for row in results])
    extra_files = all_files_local - all_files_db
    print("Deleting old files")
    for file in extra_files:
        print(f"Removing {file}")
        os.remove(file)
    empty_dirs = subprocess.run(["find", "/reddit/media", "-type", "d", "-empty"], capture_output=True, text=True)
    empty_dirs = set(empty_dirs.stdout.splitlines())
    print("Deleting empty directories")
    for dir in empty_dirs:
        print(f"Removing dir {dir}")
        os.rmdir(dir)
    print("Done")
@ -0,0 +1,12 @@
import config
import datetime
import os
import sqlite3

os.makedirs(config.db_dir, exist_ok=True)
connection = sqlite3.connect(config.db_file)
cursor = connection.cursor()
cursor.execute("CREATE TABLE IF NOT EXISTS post(permalink primary key, subreddit, created_utc, score, media_fetched, post, hidden)")
cursor.execute("CREATE TABLE IF NOT EXISTS media(permalink, url, local, PRIMARY KEY (permalink, url))")
connection.commit()
connection.close()
@ -0,0 +1,3 @@
flask
requests
pygments
@ -0,0 +1,118 @@
|
|||
import config
|
||||
import datetime
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
project_root = config.yars_dir
|
||||
src_path = os.path.join(project_root, "src")
|
||||
sys.path.append(src_path)
|
||||
|
||||
from yars.yars import YARS
|
||||
from yars.utils import download_image
|
||||
|
||||
# Initialize the YARS Reddit miner
|
||||
miner = YARS()
|
||||
|
||||
# Function to scrape subreddit post details and save to JSON
|
||||
def scrape_subreddit_data(subreddit, limit=5):
|
||||
ret = []
|
||||
subreddit_name = subreddit[0]
|
||||
minimum_score = subreddit[1]
|
||||
print(f"Starting {subreddit_name}")
|
||||
empty = dict()
|
||||
try:
|
||||
subreddit_posts = miner.fetch_subreddit_posts(
|
||||
subreddit_name, limit=limit, category="top", time_filter=config.pull_by
|
||||
)
|
||||
for i, post in enumerate(subreddit_posts, 1):
|
||||
score = post.get("score", 0)
|
||||
if score < minimum_score:
|
||||
continue
|
||||
post_data = {
|
||||
"permalink": post.get("permalink"),
|
||||
"title": post.get("title", ""),
|
||||
"author": post.get("author", ""),
|
||||
"created_utc": post.get("created_utc", ""),
|
||||
"num_comments": post.get("num_comments", 0),
|
||||
"score": post.get("score", 0),
|
||||
"media_urls" : post.get("media_urls", []),
|
||||
"body": post.get("body", None),
|
||||
}
|
||||
ret.append(post_data)
|
||||
print(f"Finished {subreddit_name}")
|
||||
return ret
|
||||
except Exception as e:
|
||||
print(f"Error occurred while scraping subreddit: {e}")
|
||||
return ret
|
||||
|
||||
def save_posts_to_db(data, cursor):
|
||||
if len(data)==0:
|
||||
return
|
||||
upsert = "INSERT INTO post(permalink, subreddit, created_utc, score, media_fetched, post, hidden) VALUES "
|
||||
upsert += ",".join(["(?,?,?,?,?,?,?)"] * len(data))
|
||||
upsert += " ON CONFLICT(permalink) DO UPDATE SET score=excluded.score, post=excluded.post"
|
||||
binds = []
|
||||
for post in data:
|
||||
binds.append(post["permalink"])
|
||||
m = re.search(r"\/r\/([a-zA-Z0-9_]+)\/.*", post["permalink"])
|
||||
binds.append(m.group(1)) #subreddit
|
||||
binds.append(post["created_utc"])
|
||||
binds.append(post["score"])
|
||||
binds.append(False)
|
||||
binds.append(json.dumps(post))
|
||||
binds.append(False)
|
||||
cursor.execute(upsert, binds)
|
||||
|
||||
def download_media(cursor):
|
||||
select = "SELECT post FROM post WHERE media_fetched = ?"
|
||||
binds = [False]
|
||||
results = cursor.execute(select, binds)
|
||||
post = results.fetchone()
|
||||
binds = []
|
||||
while post is not None:
|
||||
post = json.loads(post[0])
|
||||
if len(post["media_urls"])>0:
|
||||
for url in post["media_urls"]:
|
||||
binds.append(post["permalink"])
|
||||
binds.append(url)
|
||||
binds.append(download_image(url, config.media_dir))
|
||||
print("image downloaded")
|
||||
post = results.fetchone()
|
||||
|
||||
if len(binds)>0:
|
||||
upsert = "INSERT INTO media(permalink, url, local) VALUES "
|
||||
upsert += ",".join(["(?,?,?)"] * (len(binds)//3))
|
||||
upsert += " ON CONFLICT(permalink, url) DO UPDATE SET local=excluded.local"
|
||||
cursor.execute(upsert, binds)
|
||||
|
||||
update = "UPDATE post SET media_fetched = ? WHERE media_fetched = ?"
|
||||
binds = [True, False]
|
||||
cursor.execute(update, binds)
|
||||
|
||||
def download_comments_for_permalink(permalink, cursor):
|
||||
# Currently unused
|
||||
post_details = miner.scrape_post_details(permalink)
|
||||
update = "UPDATE post SET body = ? WHERE permalink = ?"
|
||||
binds = [post_details["body"], permalink]
|
||||
cursor.execute(update, binds)
|
||||
|
||||
upsert += "INSERT INTO comments(permalink, comments) VALUES (?, ?) ON CONFLICT(permalink) DO UPDATE SET comments=excluded.comments"
|
||||
binds = [permalink, post_details["comments"]]
|
||||
cursor.execute(upsert, binds)
|
||||
|
||||
# Main execution
|
||||
if __name__ == "__main__":
|
||||
os.makedirs(config.media_dir, exist_ok=True)
|
||||
connection = sqlite3.connect(config.db_file)
|
||||
cursor = connection.cursor()
|
||||
for subreddit in config.subreddits:
|
||||
post_data = scrape_subreddit_data(subreddit, config.max_posts_per_pull)
|
||||
save_posts_to_db(post_data, cursor)
|
||||
connection.commit()
|
||||
download_media(cursor)
|
||||
connection.commit()
|
||||
connection.close()
|
||||
|
|
@ -0,0 +1,342 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>Reddit, but better</title>
|
||||
<style>
|
||||
:root {
|
||||
--dark: #2c2c2c;
|
||||
--darker: #171717;
|
||||
--light: #bfbfbf;
|
||||
}
|
||||
html {
|
||||
height: 100%;
|
||||
}
|
||||
body {
|
||||
background-color: var(--darker);
|
||||
color: var(--light);
|
||||
min-height: 100%;
|
||||
margin: 0; /* Removes default browser margin */
|
||||
}
|
||||
img, video {
|
||||
max-width: 100%;
|
||||
max-height: 100vh;
|
||||
width: auto;
|
||||
height: auto;
|
||||
}
|
||||
div.post {
|
||||
background-color: var(--dark);
|
||||
border: 2px solid var(--light);
|
||||
border-radius: 15px;
|
||||
padding: 10px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
.sidebar {
|
||||
outline: 2px solid var(--light);
|
||||
position: sticky;
|
||||
top: 0;
|
||||
left: 0;
|
||||
background-color: var(--dark);
|
||||
display: flex;
|
||||
flex-wrap: nowrap;
|
||||
z-index: 1000;
|
||||
}
|
||||
.content {
|
||||
display: flex;
|
||||
flex-grow: 1; /* Takes up remaining space */
|
||||
flex-direction: column;
|
||||
}
|
||||
/* desktop */
|
||||
@media (min-aspect-ratio: 1) {
|
||||
img, video {
|
||||
max-height: 80vh;
|
||||
}
|
||||
div.post {
|
||||
width: 70vw;
|
||||
margin-left: 20px;
|
||||
}
|
||||
.container {
|
||||
display: flex;
|
||||
flex-direction: row;
|
||||
}
|
||||
.sidebar {
|
||||
width: fit-content;
|
||||
height: 100vh;
|
||||
flex-direction: column;
|
||||
overflow-y: auto;
|
||||
padding: 5px
|
||||
}
|
||||
}
|
||||
/* phone */
|
||||
@media (max-aspect-ratio: 1) {
|
||||
img, video {
|
||||
max-height: 100vh;
|
||||
}
|
||||
div.post {
|
||||
width: calc(100vw - 50px);
|
||||
margin-top: 10px;
|
||||
}
|
||||
.container {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
.sidebar {
|
||||
width: 100vw;
|
||||
height: 50px;
|
||||
flex-direction: row;
|
||||
overflow-x: auto;
|
||||
align-items: center;
|
||||
padding-top: 5px;
|
||||
padding-bottom: 5px;
|
||||
}
|
||||
.content {
|
||||
align-items: center;
|
||||
}
|
||||
}
|
||||
.sidebar a {
|
||||
display: block;
|
||||
color: var(--light);
|
||||
text-decoration: none;
|
||||
white-space: nowrap;
|
||||
margin: 5px;
|
||||
padding: 5px;
|
||||
}
|
||||
.sidebar a:hover {
|
||||
background-color: var(--darker);
|
||||
color: var(--light);
|
||||
}
|
||||
.content h1 {
|
||||
margin-left: 20px;
|
||||
}
|
||||
.invert {
|
||||
filter: invert(1);
|
||||
transition: filter 0.3s;
|
||||
}
|
||||
.button-wrapper {
|
||||
display: flex;
|
||||
width: 100%;
|
||||
gap: 10px;
|
||||
margin-top: 10px;
|
||||
}
|
||||
.button-wrapper.gallery {
|
||||
gap: 5px;
|
||||
}
|
||||
.button-wrapper button {
|
||||
flex: 1;
|
||||
padding: 10px;
|
||||
cursor: pointer;
|
||||
background-color: var(--darker);
|
||||
color: var(--light);
|
||||
border: 2px solid var(--light);
|
||||
border-radius: 10px;
|
||||
font-size: 1.25rem;
|
||||
font-weight: bold;
|
||||
}
|
||||
.button-wrapper button.gallery {
|
||||
padding: 5px;
|
||||
background-color: var(--darker);
|
||||
border-radius: 5px;
|
||||
border: none;
|
||||
cursor: none;
|
||||
}
|
||||
.button-wrapper button.gallery.selected {
|
||||
background-color: var(--light);
|
||||
}
|
||||
.text-content {
|
||||
overflow: hidden;
|
||||
transition: max-height 0.3s ease-out; /* Smooth transition */
|
||||
max-height: 20vh;
|
||||
position: relative;
|
||||
}
|
||||
.text-content::after {
|
||||
content: "";
|
||||
position: absolute;
|
||||
bottom: 0;
|
||||
left: 0;
|
||||
width: 100%;
|
||||
height: 30px;
|
||||
background: linear-gradient(to bottom, rgba(255,255,255,0), var(--dark));
|
||||
}
|
||||
.text-content.expanded {
|
||||
max-height: 1000vh;
|
||||
}
|
||||
.text-content.expanded::after {
|
||||
display: none;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="sidebar">
|
||||
{% for subreddit in subreddits %}
|
||||
<a href="{{ subreddit }}">{{ subreddit }}</a>
|
||||
{% endfor %}
|
||||
</div>
|
||||
<div class="content">
|
||||
<h1>{{ title }} ({{ count }})</h1>
|
||||
{% for post in posts %}
|
||||
<div class="post">
|
||||
<h2>{{ post.title }}</h2>
|
||||
<h4>Score: {{ post.score }}</h4>
|
||||
<div class="media-div">
|
||||
{% for media in post.media_html %}
|
||||
{{ media|safe }}
|
||||
{% endfor %}
|
||||
{% if post.media_html|length > 1 %}
|
||||
<span class="button-wrapper gallery">
|
||||
</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% if post.body %}
|
||||
<div class="text-content" onclick="expand(this)">
|
||||
{{ post.body }}
|
||||
</div>
|
||||
{% endif %}
|
||||
<span class="button-wrapper">
|
||||
<button type="button" onclick='comments("{{ post.permalink }}")'>Comments</button>
|
||||
<button type="button" onclick='hide(this, "{{ post.permalink }}")'>Hide</button>
|
||||
</span>
|
||||
</div>
|
||||
{% endfor %}
|
||||
</div>
|
||||
</div>
|
||||
<script>
|
||||
// setup galleries
|
||||
mediaDivs = document.querySelectorAll('.media-div');
|
||||
|
||||
mediaDivs.forEach(div => {
|
||||
images = Array.from(div.querySelectorAll('img'));
|
||||
if (images.length > 1) {
|
||||
buttonSpan = div.querySelector('.button-wrapper:first-of-type');
|
||||
images.forEach(image => {
|
||||
image.addEventListener('click', (e) => {
|
||||
if (e.offsetX > image.offsetWidth * 2 / 3) {
|
||||
// scroll right
|
||||
div = e.target.closest('.media-div');
|
||||
images = Array.from(div.querySelectorAll('img'));
|
||||
currentIndex = images.indexOf(e.target)
|
||||
if (currentIndex < (images.length -1)) {
|
||||
buttons = Array.from(div.querySelectorAll('button'));
|
||||
images[currentIndex].style.display = "none";
|
||||
images[currentIndex+1].style.display = "block";
|
||||
buttons[currentIndex].classList.remove('selected');
|
||||
buttons[currentIndex+1].classList.add('selected');
|
||||
} else {
|
||||
e.target.classList.toggle('invert');
|
||||
}
|
||||
} else if (e.offsetX < image.offsetWidth / 3) {
|
||||
// scroll left
|
||||
div = e.target.closest('.media-div');
|
||||
images = Array.from(div.querySelectorAll('img'));
|
||||
currentIndex = images.indexOf(e.target)
|
||||
if (currentIndex > 0) {
|
||||
buttons = Array.from(div.querySelectorAll('button'));
|
||||
images[currentIndex].style.display = "none";
|
||||
images[currentIndex-1].style.display = "block";
|
||||
buttons[currentIndex].classList.remove('selected');
|
||||
buttons[currentIndex-1].classList.add('selected');
|
||||
} else {
|
||||
e.target.classList.toggle('invert');
|
||||
}
|
||||
} else {
|
||||
image.classList.toggle('invert');
|
||||
}
|
||||
});
|
||||
});
|
||||
firstImage = images.shift();
|
||||
firstButton = document.createElement('button');
|
||||
firstButton.classList.add('gallery');
|
||||
firstButton.classList.add('selected');
|
||||
buttonSpan.appendChild(firstButton);
|
||||
images.forEach(image => {
|
||||
image.style.display = "none";
|
||||
button = document.createElement('button');
|
||||
button.classList.add('gallery');
|
||||
buttonSpan.appendChild(button);
|
||||
});
|
||||
} else {
|
||||
images.forEach(image => {
|
||||
image.addEventListener('click', () => {
|
||||
image.classList.toggle('invert');
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
// main button code
|
||||
|
||||
function hide(button, permalink){
|
||||
const div = button.closest('.post');
|
||||
div.scrollTo({
|
||||
top: 0,
|
||||
behavior: 'smooth'
|
||||
});
|
||||
div.remove();
|
||||
try {
|
||||
fetch('/hide' + permalink);
|
||||
} catch (error) {
|
||||
console.error('Could not hide', error);
|
||||
}
|
||||
}
|
||||
|
||||
function comments(permalink){
|
||||
window.open("https://reddit.com" + permalink, '_blank');
|
||||
}
|
||||
|
||||
// text expand code
|
||||
|
||||
function checkHeight(){
|
||||
const divs = document.querySelectorAll('.text-content');
|
||||
divs.forEach(div => {
|
||||
height = div.offsetHeight;
|
||||
style = window.getComputedStyle(div);
|
||||
maxHeight = parseInt(style.maxHeight);
|
||||
if (height < maxHeight) {
|
||||
div.classList.add('expanded');
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
function expand(div) {
|
||||
div.classList.add('expanded');
|
||||
}
|
||||
|
||||
window.addEventListener('load', (event) => {
|
||||
checkHeight()
|
||||
});
|
||||
|
||||
window.addEventListener('resize', (event) => {
|
||||
checkHeight()
|
||||
});
|
||||
|
||||
// audio/video sync code
|
||||
|
||||
function findAudio(video){
|
||||
const div = video.closest('.post');
|
||||
return div.querySelector('audio:first-of-type');
|
||||
}
|
||||
|
||||
function playAudio(video){
|
||||
audio = findAudio(video);
|
||||
if (audio) {
|
||||
audio.play();
|
||||
audio.currentTime = video.currentTime;
|
||||
}
|
||||
}
|
||||
function pauseAudio(video){
|
||||
audio = findAudio(video);
|
||||
if (audio) {
|
||||
audio.pause();
|
||||
audio.currentTime = video.currentTime;
|
||||
}
|
||||
}
|
||||
function seekAudio(video){
|
||||
audio = findAudio(video);
|
||||
if (audio) {
|
||||
audio.currentTime = video.currentTime;
|
||||
}
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -0,0 +1,26 @@
services:
  ofelia:
    image: mcuadros/ofelia:latest
    restart: unless-stopped
    depends_on:
      - web
    command: daemon --docker
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
  web:
    build:
      context: app
      target: builder
    restart: unless-stopped
    stop_signal: SIGINT
    ports:
      - '8001:8000'
    volumes:
      - ./db:/reddit/db
      - ./media:/reddit/media
    labels:
      ofelia.enabled: "true"
      ofelia.job-exec.scrape.schedule: "@every 60m"
      ofelia.job-exec.scrape.command: "python3 /app/scrape_posts.py"
      ofelia.job-exec.clean.schedule: "@every 24h"
      ofelia.job-exec.clean.command: "python3 /app/delete_posts.py"