Compare commits

...

3 Commits

Author SHA1 Message Date
Ventilaar
570ac88b99 Reimplement orphaned video listing and more accurate stats
Some checks failed
Update worker server / build-and-publish (release) Has been cancelled
Generate docker image / build-and-publish (release) Successful in 19s
2025-02-11 22:55:21 +01:00
Ventilaar
c51a72ec2b minor typo
Some checks failed
Generate docker image / build-and-publish (release) Failing after 7s
Update worker server / build-and-publish (release) Successful in 16s
2025-02-11 17:41:20 +01:00
Ventilaar
fa8f11dad6 Major update
All checks were successful
Update worker server / build-and-publish (release) Successful in 13s
Generate docker image / build-and-publish (release) Successful in 59s
2025-02-11 13:12:10 +01:00
15 changed files with 253 additions and 72 deletions

View File

@@ -52,13 +52,22 @@ Extra functionality for further development of features.
### Stage 3
Mainly focused on retiring the cronjob based scripts and moving it to celery based tasks
- [ ] manage videos by ID's instead of per channel basis
- [ ] download videos from queue
- [x] manage videos by ID's instead of per channel basis
- [x] download videos from queue
- [x] Manage websub callbacks
- [x] Implement yt-dlp proxy servers, as the VPN is blocked
- [x] Celery tasks based video downloading
- [x] Manage websub callbacks
- [x] Celery task queue views
- [x] More performant statistics
- [ ] Retire cronjobs
- [ ] Retire file based configurations
### Stage 4
MongoDB finally has its limitations.
- [ ] Migrate to postgresql
- [ ] Retire time based tasks like channel mirroring
- [ ] A more comprehensive statistics page, uploads per day, downloads per day and such
### Stage ...
Since this is my flagship software, more features will be added as development continues.

View File

@@ -1,13 +0,0 @@
# Write a JSON lockfile recording the current local time and this process' PID.
import os
import datetime
import json


def print_current_time(give=False):
    """Print and return the current local time, truncated to whole seconds."""
    now = datetime.datetime.now().replace(microsecond=0)
    print(f'--- It is {now} ---')
    return now


with open('lockfile', 'w') as lock_file:
    payload = {'time': print_current_time(), 'PID': os.getpid()}
    # default=str serializes the datetime object as its ISO-like string form
    json.dump(payload, lock_file, default=str)

View File

@@ -24,9 +24,11 @@ def create_app(test_config=None):
# Celery Periodic tasks
config['CELERY']['beat_schedule'] = {}
config['CELERY']['beat_schedule']['Renew WebSub endpoints'] = {'task': 'ayta.tasks.websub_renew_expiring', 'schedule': 4000}
config['CELERY']['beat_schedule']['Process WebSub data'] = {'task': 'ayta.tasks.websub_process_data', 'schedule': 100}
config['CELERY']['beat_schedule']['Queue up new videos in static channel playlists'] = {'task': 'ayta.tasks.playlist_to_queue', 'schedule': 50000}
config['CELERY']['beat_schedule']['Renew WebSub endpoints around every hour'] = {'task': 'ayta.tasks.websub_renew_expiring', 'schedule': 4000}
config['CELERY']['beat_schedule']['Process WebSub data around every two minutes'] = {'task': 'ayta.tasks.websub_process_data', 'schedule': 100}
config['CELERY']['beat_schedule']['Queue up new videos in static channel playlists about 2 times a day'] = {'task': 'ayta.tasks.playlist_to_queue', 'schedule': 50000}
config['CELERY']['beat_schedule']['Download around 123 videos spread out through the day'] = {'task': 'ayta.tasks.video_queue', 'schedule': 700}
config['CELERY']['beat_schedule']['Generate new statistiscs about every 3 hours'] = {'task': 'ayta.tasks.generate_statistics', 'schedule': 10000}
# Celery task routing
# Tasks not defined in this configuration will be routed to the default queue "celery"
@@ -51,6 +53,7 @@ def create_app(test_config=None):
app.jinja_env.filters['current_time'] = filters.current_time
app.jinja_env.filters['epoch_time'] = filters.epoch_time
app.jinja_env.filters['epoch_date'] = filters.epoch_date
app.jinja_env.filters['datetime_date'] = filters.datetime_date
from .blueprints import watch
from .blueprints import index

View File

@@ -230,8 +230,9 @@ def queue():
endpoints = get_nosql().queue_getEndpoints()
queue = get_nosql().queue_getQueue()
count = len(list(queue.clone()))
return render_template('admin/queue.html', endpoints=endpoints, queue=queue)
return render_template('admin/queue.html', endpoints=endpoints, queue=queue, count=count)
@bp.route('/users', methods=['GET', 'POST'])
@login_required

View File

@@ -22,5 +22,4 @@ def base():
return render_template('search/index.html', results=results, query=query)
return render_template('search/index.html', stats=get_nosql().gen_stats())
return render_template('search/index.html', stats=get_nosql().statistics_get())

View File

@@ -27,6 +27,12 @@ def epoch_time(epoch):
return datetime.fromtimestamp(epoch).strftime('%d %b %Y %H:%M:%S')
except:
return None
def datetime_date(obj):
    """Jinja filter: format a datetime-like object as e.g. '11 Feb 2025 13:12'.

    Returns None when *obj* cannot be formatted (None, a string, a bad
    value), mirroring the None-on-failure convention of the sibling
    epoch_time/epoch_date filters.
    """
    try:
        return obj.strftime('%d %b %Y %H:%M')
    # BUGFIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; catch only the failures a bad input can raise.
    except (AttributeError, TypeError, ValueError):
        return None
def current_time(null=None, object=False):
if object:

View File

@@ -44,6 +44,7 @@ class Mango:
self.reports = self.db['reports']
self.queue_endpoints = self.db['queue_endpoints']
self.users = self.db['users']
self.statistics = self.db['statistics']
self.ensure_indexes()
@@ -55,7 +56,11 @@ class Mango:
'info_json': [
('id_1', True),
('channel_id_1', False),
('uploader_1', False)
('uploader_1', False),
('timestamp', False),
('upload_date', False),
('filesize_approx', False),
('_status', False)
],
'websub_callbacks': [
('id', True)
@@ -92,16 +97,6 @@ class Mango:
# general functions #
##########################################
def gen_stats(self):
    """Return archive-wide document counts keyed by 'videos', 'channels', 'queue'."""
    return {
        'videos': self.info_json.count_documents({}),
        'channels': self.channels.count_documents({}),
        'queue': self.download_queue.count_documents({}),
    }
def search_videos(self, query):
# search the index for the requested query. return limited keys
results = self.info_json.find({"$text": {"$search": query}},
@@ -198,7 +193,9 @@ class Mango:
def get_orphaned_videos(self):
""" Returns a SET of YouTube video ID's which have info_jsons in the collection but no permanent channel is defined. SLOW OPERATION """
# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
# The following code I have commented out because the query took too long to proccess, timing the operation out
"""# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
pipeline = [{'$match': {'_status': 'available'}},
{'$lookup': {'from': 'channels', 'localField': 'channel_id', 'foreignField': 'id', 'as': 'channel'}},
{'$match': {'channel': {'$size': 0}}},{'$project': {'id': 1}},
@@ -207,7 +204,20 @@ class Mango:
results = self.info_json.aggregate(pipeline)
ids = [result['id'] for result in results]
return tuple(ids)
return tuple(ids)"""
# Reimplementing the query but in python, as I do not care about memory usage or data transfer
channels = self.channels.find({}, {'_id': 0, 'id': 1})
videos = self.info_json.find({'_status': 'available'}, {'_id': 0, 'channel_id': 1, 'id': 1})
channels = set([x['id'] for x in channels])
orphaned = []
for item in videos:
if item['channel_id'] not in channels:
orphaned.append(item['id'])
return tuple(orphaned)
def get_recent_videos(self, count=99):
""" Returns a SET of YouTube video ID's which have been added last to the info_json collection """
@@ -455,6 +465,44 @@ class Mango:
continue
self.download_queue.update_one({'id': queueItem['id']}, {'$set': {'status': 'working'}})
return queueItem
##########################################
# STATISTICS FUNCTIONS #
##########################################
def statistics_counts(self):
    """Return human-readable count lines for videos, channels and the download queue."""
    available_videos = self.info_json.count_documents({'_status': 'available'})
    channel_total = self.channels.count_documents({})
    queued_downloads = self.download_queue.count_documents({'status': 'queued'})
    return {
        'videos': f"{available_videos} videos in the archive",
        'channels': f'{channel_total} channels in the system',
        'download_queue': f"{queued_downloads} queued videos for download",
    }
def statistics_sizes(self):
    """Return a human-readable approximation of total archive storage in TB."""
    documents = self.info_json.find({'_status': 'available'}, {'filesize_approx': 1})
    total_bytes = sum(
        int(doc['filesize_approx'])
        for doc in documents
        if doc.get('filesize_approx')
    )
    # the 5 is the amount of TB of unjust approximation (updated feb 2025)
    terabytes = int(total_bytes / 1000000000000 + 5)
    return {'storage': f'{terabytes} TB of storage'}
def statistics_generate(self):
    """Recompute archive statistics and persist them as the single
    document in the statistics collection (old snapshots are dropped).
    """
    from datetime import datetime  # local import keeps the fix self-contained

    data = self.statistics_sizes() | self.statistics_counts()
    # BUGFIX: was `self.datetime.utcnow()` — no `datetime` attribute is
    # assigned on this class in the visible initializer, so this raised
    # AttributeError at runtime; use the stdlib datetime module instead.
    data['last_updated'] = datetime.utcnow()
    self.statistics.delete_many({})  # drop existing documents; keep one fresh snapshot
    self.statistics.insert_one(data)
def statistics_get(self):
    """Fetch the cached statistics document (without the Mongo `_id`), or None if absent."""
    projection = {'_id': 0}
    return self.statistics.find_one({}, projection)
##########################################
# HELPER FUNCTIONS #

View File

@@ -168,6 +168,12 @@ def websub_renew_expiring(hours=6):
@shared_task()
def playlist_to_queue():
"""
As there is still one cronjob based task running daily in the background, we have to make sure that gets hooked as well into the system.
The cronjob task gets the last 50 uploads for all channels and commits the playlist json into the database
This task makes sure we append the ID's that we got from the playlist into the download queue.
Should idealy be run after the cronjob completes, but I don't want to implement an API that does that, so this gets run twice a day.
"""
from .nosql import get_nosql
import random
from datetime import datetime, timedelta
@@ -187,6 +193,11 @@ def playlist_to_queue():
for item in info['playlist']['entries']:
videoId = item['id']
get_nosql().queue_insertQueue(videoId, 'Playlist mirroring')
@shared_task()
def generate_statistics():
    """Celery task: rebuild the cached archive statistics document."""
    from .nosql import get_nosql

    mango = get_nosql()
    mango.statistics_generate()
##########################################
# TASK MODULES #

View File

@@ -126,10 +126,13 @@
</div>
<div class="divider"></div>
<div class="row">
<div class="col s6 l9">
<div class="col s4 l8">
<h5>Queued ID's</h5>
</div>
<div class="col s6 l3 m-4 input-field">
<div class="col s4 l1">
<p>{{ count }} items</p>
</div>
<div class="col s4 l3 m-4 input-field">
<input id="filter_query" type="text">
<label for="filter_query">Filter results</label>
</div>

View File

@@ -5,68 +5,72 @@
{% block content %}
<div class="row">
<div class="col s12 l3 m-4">
<h4>Search the archive</h4>
<p>Searching is currently partially working and will probably not work optimally for a long time until the database and backend is fully reworked.</p>
<p>In the meantime if you know the channel name and video title you can use local search on <a href="{{ url_for('channel.base') }}">this</a> page</p>
<img class="responsive-img" src="{{ url_for('static', filename='img/mongo_meme.png') }}">
{% if stats is defined %}
<div class="divider"></div>
<h5>Stats of the archive</h5>
<h4>Search the archive</h4>
<p>Searching is currently partially working and will probably not work optimally for a long time until the database and backend is fully reworked.</p>
<p>In the meantime if you know the channel name and video title you can use local search on <a href="{{ url_for('channel.base') }}">this</a> page</p>
<img class="responsive-img" src="{{ url_for('static', filename='img/mongo_meme.png') }}">
{% if stats is not none and stats is defined %}
<div class="divider"></div>
<h5>Stats of the archive</h5>
<ul class="collection">
{% for stat in stats %}
{% for stat in stats %}
<li class="collection-item">
<span class="title">{{ stat }}</span>
<p>{{ stats[stat] }}</p>
</li>
{% endfor %}
<!--<span class="title">{{ stat }}</span>-->
{% if stat == 'last_updated' %}
Last updated {{ stats[stat]|datetime_date }} UTC
{% else %}
{{ stats[stat] }}
{% endif %}
</li>
{% endfor %}
</ul>
{% endif %}
{% endif %}
</div>
<div class="col s12 l9 m-4">
<div class="row">
<div class="col s6 offset-s3">
<div class="col s6 offset-s3">
<img class="responsive-img" src="{{ url_for('static', filename='img/bing_chilling.png') }}">
</div>
</div>
<div class="col s12 center-align">
<h5>"A big archive needs a search function." -Sun Tzu</h5>
</div>
</div>
<div class="divider"></div>
<form method="post" class="">
<div class="row">
<div class="col s12 m-4 input-field">
<input id="first_name" name="query" type="text" placeholder='Search the archive!' maxlength="64" value="{{ query }}">
</div>
<div class="divider"></div>
<form method="post" class="">
<div class="row">
<div class="col s12 m-4 input-field">
<input id="first_name" name="query" type="text" placeholder='Search the archive!' maxlength="64" value="{{ query }}">
<label for="first_name">Searching in video titles, uploader names and tags.</label>
<span class="supporting-text">Input will be interpreted as keywords. You can search for literal text by using quotes("). Or exclude by prepending minus (-).</span>
</div>
<div class="col s12 m-4">
</div>
<div class="col s12 m-4">
<button class="btn icon-right waves-effect waves-light" type="submit" name="task" value="search">Search</button>
</div>
</div>
</form>
{% if results is defined %}
<div class="divider"></div>
</div>
</div>
</form>
{% if results is defined %}
<div class="divider"></div>
<table class="striped highlight responsive-table">
<thead>
<tr>
<th>Title</th>
<th>Uploader</th>
<th>Date</th>
<th>Date</th>
</tr>
</thead>
<tbody>
{% for result in results %}
{% for result in results %}
<tr>
<td><a href="{{ url_for('watch.base') }}?v={{ result.get('id') }}">{{ result.get('title') }}</a></td>
<td><a href="{{ url_for('channel.channel', channelId=result.get('channel_id')) }}">{{ result.get('uploader') }}</a></td>
<td>{{ result.get('upload_date')|pretty_time }}</td>
<td>{{ result.get('upload_date')|pretty_time }}</td>
</tr>
{% endfor %}
{% endfor %}
</tbody>
</table>
{% if results|length == 0 %}<h6>No results. Relax the search terms more please!</h6>{% else %}<p>Not the results you were looking for? Try adding quotes ("") around important words.</p>{% endif %}
{% endif %}
{% if results|length == 0 %}<h6>No results. Relax the search terms more please!</h6>{% else %}<p>Not the results you were looking for? Try adding quotes ("") around important words.</p>{% endif %}
{% endif %}
</div>
</div>
{% endblock %}

View File

@@ -6,7 +6,7 @@
<meta property="og:title" content="{{ render.get('info').get('title') }}" />
<meta property="og:type" content="website" />
<meta property="og:url" content="{{ url_for('watch.base') }}?v={{ render.get('info').get('id') }}" />
<meta property="og:image" content="https://archive.ventilaar.net/videos/automatic/{{ render.get('info').get('channel_id') }}/{{ render.get('info').get('id') }}/{{ render.get('info').get('title') }}.jpg" />
<meta property="og:image" content="https://archive.ventilaar.net/videos/automatic/{{ render.get('info').get('channel_id') }}/{{ render.get('info').get('id') }}/{{ render.get('info').get('_title_slug') }}.jpg" />
<meta property="og:description" content="{{ render.get('info').get('description', '')|truncate(100) }}" />
{% endblock %}

View File

@@ -0,0 +1,20 @@
from ayta.nosql import Mango

# One-off maintenance script: walk the failed download queue and mark videos
# that YouTube removed for Terms of Service violations as permanently
# unavailable, then drop them from the queue.
mango = Mango('mongodb://root:example@192.168.66.140:27017')

# Loop-invariant marker text to look for in the last line of the failure log.
TOS_MESSAGE = "This video has been removed for violating YouTube's Terms of Service"

for entry in mango.download_queue.find({'status': 'failed'}):
    video_id = entry['id']
    last_error_line = entry['fail_reason'].splitlines()[-1]
    if TOS_MESSAGE in last_error_line:
        print(video_id)
        mango.info_json.insert_one({
            'id': video_id,
            '_status': 'unavailable',
            '_status_description': f'Video is unavailable because YouTube said: {TOS_MESSAGE}',
        })
        mango.queue_deleteQueue(video_id)
    else:
        print(last_error_line)

print('done')

18
one_offs/archive_size.py Normal file
View File

@@ -0,0 +1,18 @@
from ayta.nosql import Mango
#import ayta
#app = ayta.create_app()
mango = Mango('mongodb://root:example@192.168.66.140:27017')
data = mango.info_json.find({'_status': 'available'}, {'filesize_approx': 1})
total = 0
for x in data:
size = x.get('filesize_approx')
if size:
total = total + int(size)
# the 5000 is the amount of GB of unjust approximation
total = int(total / 1000000000 + 5000)
print(f'Approximate size: {total} GB')

View File

@@ -0,0 +1,37 @@
from ayta.nosql import Mango
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
#import ayta
#app = ayta.create_app()
mango = Mango('mongodb://root:example@192.168.66.140:27017')
pivot = datetime.utcnow() - timedelta(days=90)
pivot = int(pivot.timestamp())
data = mango.info_json.find({'_status': 'available', 'timestamp': {'$gt': pivot}}, {'epoch': 1})
stat = {}
for x in data:
epoch = x['epoch']
day = datetime.fromtimestamp(epoch).strftime('%Y%m%d')
if day not in stat:
stat[day] = 1
else:
stat[day] = stat[day] + 1
dates = list(stat.keys())
values = list(stat.values())
plt.figure(figsize=(16, 8)) # Set the figure size
plt.bar(dates, values) # Create the bar chart
# Customize the x-axis labels to be vertical
plt.xticks(rotation=45, ha='right') # Rotate xticklabels by 45 degrees and align them to the right
plt.xlabel('Date') # Label for x-axis
plt.ylabel('Counts') # Label for y-axis
plt.title('Bar Graph of Counts by Date') # Title of the graph
# Display the graph
plt.show()

View File

@@ -0,0 +1,35 @@
from ayta.nosql import Mango
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
#import ayta
#app = ayta.create_app()
mango = Mango('mongodb://root:example@192.168.66.140:27017')
pivot = '20220101'
data = mango.info_json.find({'_status': 'available', 'upload_date': {'$gt': pivot}}, {'upload_date': 1})
stat = {}
for x in data:
day = x['upload_date']
if day not in stat:
stat[day] = 1
else:
stat[day] = stat[day] + 1
dates = list(stat.keys())
values = list(stat.values())
plt.figure(figsize=(16, 8)) # Set the figure size
plt.bar(dates, values) # Create the bar chart
# Customize the x-axis labels to be vertical
plt.xticks(rotation=45, ha='right') # Rotate xticklabels by 45 degrees and align them to the right
plt.xlabel('Date') # Label for x-axis
plt.ylabel('Counts') # Label for y-axis
plt.title('Bar Graph of Counts by Date') # Title of the graph
# Display the graph
plt.show()