You've already forked amazing-ytdlp-archive
Compare commits
3 Commits
Author | SHA1 | Date | |
---|---|---|---|
![]() |
570ac88b99 | ||
![]() |
c51a72ec2b | ||
![]() |
fa8f11dad6 |
13
README.md
13
README.md
@@ -52,13 +52,22 @@ Extra functionality for further development of features.
|
||||
|
||||
### Stage 3
|
||||
Mainly focused on retiring the cronjob-based scripts and moving them to Celery-based tasks
|
||||
- [ ] manage videos by ID's instead of per channel basis
|
||||
- [ ] download videos from queue
|
||||
- [x] manage videos by IDs instead of on a per-channel basis
|
||||
- [x] download videos from queue
|
||||
- [x] Manage websub callbacks
|
||||
- [x] Implement yt-dlp proxy servers, as the VPN is blocked
|
||||
- [x] Celery tasks based video downloading
|
||||
- [x] Manage websub callbacks
|
||||
- [x] Celery task queue views
|
||||
- [x] More performant statistics
|
||||
- [ ] Retire cronjobs
|
||||
- [ ] Retire file based configurations
|
||||
|
||||
### Stage 4
|
||||
MongoDB finally has its limitations.
|
||||
- [ ] Migrate to postgresql
|
||||
- [ ] Retire time based tasks like channel mirroring
|
||||
- [ ] A more comprehensive statistics page, uploads per day, downloads per day and such
|
||||
|
||||
### Stage ...
|
||||
Since this is the flagship software I have developed, more features will be added.
|
||||
|
@@ -1,13 +0,0 @@
|
||||
#Import os Library
|
||||
import os
|
||||
import datetime
|
||||
import json
|
||||
|
||||
def print_current_time(give=False):
|
||||
time = datetime.datetime.now().replace(microsecond=0)
|
||||
print(f'--- It is {time} ---')
|
||||
return time
|
||||
|
||||
with open('lockfile', 'w') as file:
|
||||
data = {'time': print_current_time(), 'PID': os.getpid()}
|
||||
file.write(json.dumps(data, default=str))
|
@@ -24,9 +24,11 @@ def create_app(test_config=None):
|
||||
# Celery Periodic tasks
|
||||
|
||||
config['CELERY']['beat_schedule'] = {}
|
||||
config['CELERY']['beat_schedule']['Renew WebSub endpoints'] = {'task': 'ayta.tasks.websub_renew_expiring', 'schedule': 4000}
|
||||
config['CELERY']['beat_schedule']['Process WebSub data'] = {'task': 'ayta.tasks.websub_process_data', 'schedule': 100}
|
||||
config['CELERY']['beat_schedule']['Queue up new videos in static channel playlists'] = {'task': 'ayta.tasks.playlist_to_queue', 'schedule': 50000}
|
||||
config['CELERY']['beat_schedule']['Renew WebSub endpoints around every hour'] = {'task': 'ayta.tasks.websub_renew_expiring', 'schedule': 4000}
|
||||
config['CELERY']['beat_schedule']['Process WebSub data around every two minutes'] = {'task': 'ayta.tasks.websub_process_data', 'schedule': 100}
|
||||
config['CELERY']['beat_schedule']['Queue up new videos in static channel playlists about 2 times a day'] = {'task': 'ayta.tasks.playlist_to_queue', 'schedule': 50000}
|
||||
config['CELERY']['beat_schedule']['Download around 123 videos spread out through the day'] = {'task': 'ayta.tasks.video_queue', 'schedule': 700}
|
||||
config['CELERY']['beat_schedule']['Generate new statistiscs about every 3 hours'] = {'task': 'ayta.tasks.generate_statistics', 'schedule': 10000}
|
||||
|
||||
# Celery task routing
|
||||
# Tasks not defined in this configuration will be routed to the default queue "celery"
|
||||
@@ -51,6 +53,7 @@ def create_app(test_config=None):
|
||||
app.jinja_env.filters['current_time'] = filters.current_time
|
||||
app.jinja_env.filters['epoch_time'] = filters.epoch_time
|
||||
app.jinja_env.filters['epoch_date'] = filters.epoch_date
|
||||
app.jinja_env.filters['datetime_date'] = filters.datetime_date
|
||||
|
||||
from .blueprints import watch
|
||||
from .blueprints import index
|
||||
|
@@ -230,8 +230,9 @@ def queue():
|
||||
|
||||
endpoints = get_nosql().queue_getEndpoints()
|
||||
queue = get_nosql().queue_getQueue()
|
||||
count = len(list(queue.clone()))
|
||||
|
||||
return render_template('admin/queue.html', endpoints=endpoints, queue=queue)
|
||||
return render_template('admin/queue.html', endpoints=endpoints, queue=queue, count=count)
|
||||
|
||||
@bp.route('/users', methods=['GET', 'POST'])
|
||||
@login_required
|
||||
|
@@ -22,5 +22,4 @@ def base():
|
||||
|
||||
return render_template('search/index.html', results=results, query=query)
|
||||
|
||||
|
||||
return render_template('search/index.html', stats=get_nosql().gen_stats())
|
||||
return render_template('search/index.html', stats=get_nosql().statistics_get())
|
@@ -27,6 +27,12 @@ def epoch_time(epoch):
|
||||
return datetime.fromtimestamp(epoch).strftime('%d %b %Y %H:%M:%S')
|
||||
except:
|
||||
return None
|
||||
|
||||
def datetime_date(obj):
|
||||
try:
|
||||
return obj.strftime('%d %b %Y %H:%M')
|
||||
except:
|
||||
return None
|
||||
|
||||
def current_time(null=None, object=False):
|
||||
if object:
|
||||
|
@@ -44,6 +44,7 @@ class Mango:
|
||||
self.reports = self.db['reports']
|
||||
self.queue_endpoints = self.db['queue_endpoints']
|
||||
self.users = self.db['users']
|
||||
self.statistics = self.db['statistics']
|
||||
|
||||
self.ensure_indexes()
|
||||
|
||||
@@ -55,7 +56,11 @@ class Mango:
|
||||
'info_json': [
|
||||
('id_1', True),
|
||||
('channel_id_1', False),
|
||||
('uploader_1', False)
|
||||
('uploader_1', False),
|
||||
('timestamp', False),
|
||||
('upload_date', False),
|
||||
('filesize_approx', False),
|
||||
('_status', False)
|
||||
],
|
||||
'websub_callbacks': [
|
||||
('id', True)
|
||||
@@ -92,16 +97,6 @@ class Mango:
|
||||
# general functions #
|
||||
##########################################
|
||||
|
||||
def gen_stats(self):
|
||||
""" Returns DICT; Channel statistics given the dict key """
|
||||
stats = {}
|
||||
|
||||
stats['videos'] = self.info_json.count_documents({})
|
||||
stats['channels'] = self.channels.count_documents({})
|
||||
stats['queue'] = self.download_queue.count_documents({})
|
||||
|
||||
return stats
|
||||
|
||||
def search_videos(self, query):
|
||||
# search the index for the requested query. return limited keys
|
||||
results = self.info_json.find({"$text": {"$search": query}},
|
||||
@@ -198,7 +193,9 @@ class Mango:
|
||||
|
||||
def get_orphaned_videos(self):
|
||||
""" Returns a SET of YouTube video ID's which have info_jsons in the collection but no permanent channel is defined. SLOW OPERATION """
|
||||
# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
|
||||
|
||||
# The following code is commented out because the query took too long to process, causing the operation to time out
|
||||
"""# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
|
||||
pipeline = [{'$match': {'_status': 'available'}},
|
||||
{'$lookup': {'from': 'channels', 'localField': 'channel_id', 'foreignField': 'id', 'as': 'channel'}},
|
||||
{'$match': {'channel': {'$size': 0}}},{'$project': {'id': 1}},
|
||||
@@ -207,7 +204,20 @@ class Mango:
|
||||
results = self.info_json.aggregate(pipeline)
|
||||
ids = [result['id'] for result in results]
|
||||
|
||||
return tuple(ids)
|
||||
return tuple(ids)"""
|
||||
|
||||
# Reimplementing the query but in python, as I do not care about memory usage or data transfer
|
||||
channels = self.channels.find({}, {'_id': 0, 'id': 1})
|
||||
videos = self.info_json.find({'_status': 'available'}, {'_id': 0, 'channel_id': 1, 'id': 1})
|
||||
|
||||
channels = set([x['id'] for x in channels])
|
||||
orphaned = []
|
||||
|
||||
for item in videos:
|
||||
if item['channel_id'] not in channels:
|
||||
orphaned.append(item['id'])
|
||||
|
||||
return tuple(orphaned)
|
||||
|
||||
def get_recent_videos(self, count=99):
|
||||
""" Returns a SET of YouTube video ID's which have been added last to the info_json collection """
|
||||
@@ -455,6 +465,44 @@ class Mango:
|
||||
continue
|
||||
self.download_queue.update_one({'id': queueItem['id']}, {'$set': {'status': 'working'}})
|
||||
return queueItem
|
||||
|
||||
##########################################
|
||||
# STATISTICS FUNCTIONS #
|
||||
##########################################
|
||||
|
||||
def statistics_counts(self):
|
||||
counts = {}
|
||||
|
||||
counts['videos'] = f"{self.info_json.count_documents({'_status': 'available'})} videos in the archive"
|
||||
counts['channels'] = f'{self.channels.count_documents({})} channels in the system'
|
||||
counts['download_queue'] = f"{self.download_queue.count_documents({'status': 'queued'})} queued videos for download"
|
||||
|
||||
return counts
|
||||
|
||||
def statistics_sizes(self):
|
||||
sizes = {}
|
||||
data = self.info_json.find({'_status': 'available'}, {'filesize_approx': 1})
|
||||
|
||||
total = 0
|
||||
|
||||
for x in data:
|
||||
size = x.get('filesize_approx')
|
||||
if size:
|
||||
total = total + int(size)
|
||||
|
||||
sizes['storage'] = f'{int(total / 1000000000000 + 5)} TB of storage' # the 5 is the amount of TB of unjust approximation (updated feb 2025)
|
||||
|
||||
return sizes
|
||||
|
||||
def statistics_generate(self):
|
||||
data = self.statistics_sizes() | self.statistics_counts()
|
||||
data['last_updated'] = self.datetime.utcnow()
|
||||
|
||||
self.statistics.delete_many({}) # drop existing documents
|
||||
self.statistics.insert_one(data)
|
||||
|
||||
def statistics_get(self):
|
||||
return self.statistics.find_one({}, {'_id': 0})
|
||||
|
||||
##########################################
|
||||
# HELPER FUNCTIONS #
|
||||
|
@@ -168,6 +168,12 @@ def websub_renew_expiring(hours=6):
|
||||
|
||||
@shared_task()
|
||||
def playlist_to_queue():
|
||||
"""
|
||||
As there is still one cronjob-based task running daily in the background, we have to make sure it gets hooked into the system as well.
|
||||
The cronjob task gets the last 50 uploads for all channels and commits the playlist json into the database
|
||||
This task makes sure we append the ID's that we got from the playlist into the download queue.
|
||||
Should ideally be run after the cronjob completes, but I don't want to implement an API that does that, so this gets run twice a day.
|
||||
"""
|
||||
from .nosql import get_nosql
|
||||
import random
|
||||
from datetime import datetime, timedelta
|
||||
@@ -187,6 +193,11 @@ def playlist_to_queue():
|
||||
for item in info['playlist']['entries']:
|
||||
videoId = item['id']
|
||||
get_nosql().queue_insertQueue(videoId, 'Playlist mirroring')
|
||||
|
||||
@shared_task()
|
||||
def generate_statistics():
|
||||
from .nosql import get_nosql
|
||||
get_nosql().statistics_generate()
|
||||
|
||||
##########################################
|
||||
# TASK MODULES #
|
||||
|
@@ -126,10 +126,13 @@
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<div class="row">
|
||||
<div class="col s6 l9">
|
||||
<div class="col s4 l8">
|
||||
<h5>Queued ID's</h5>
|
||||
</div>
|
||||
<div class="col s6 l3 m-4 input-field">
|
||||
<div class="col s4 l1">
|
||||
<p>{{ count }} items</p>
|
||||
</div>
|
||||
<div class="col s4 l3 m-4 input-field">
|
||||
<input id="filter_query" type="text">
|
||||
<label for="filter_query">Filter results</label>
|
||||
</div>
|
||||
|
@@ -5,68 +5,72 @@
|
||||
{% block content %}
|
||||
<div class="row">
|
||||
<div class="col s12 l3 m-4">
|
||||
<h4>Search the archive</h4>
|
||||
<p>Searching is currently partially working and will probably not work optimally for a long time until the database and backend is fully reworked.</p>
|
||||
<p>In the meantime if you know the channel name and video title you can use local search on <a href="{{ url_for('channel.base') }}">this</a> page</p>
|
||||
<img class="responsive-img" src="{{ url_for('static', filename='img/mongo_meme.png') }}">
|
||||
{% if stats is defined %}
|
||||
<div class="divider"></div>
|
||||
<h5>Stats of the archive</h5>
|
||||
<h4>Search the archive</h4>
|
||||
<p>Searching is currently partially working and will probably not work optimally for a long time until the database and backend is fully reworked.</p>
|
||||
<p>In the meantime if you know the channel name and video title you can use local search on <a href="{{ url_for('channel.base') }}">this</a> page</p>
|
||||
<img class="responsive-img" src="{{ url_for('static', filename='img/mongo_meme.png') }}">
|
||||
{% if stats is not none and stats is defined %}
|
||||
<div class="divider"></div>
|
||||
<h5>Stats of the archive</h5>
|
||||
<ul class="collection">
|
||||
{% for stat in stats %}
|
||||
{% for stat in stats %}
|
||||
<li class="collection-item">
|
||||
<span class="title">{{ stat }}</span>
|
||||
<p>{{ stats[stat] }}</p>
|
||||
</li>
|
||||
{% endfor %}
|
||||
<!--<span class="title">{{ stat }}</span>-->
|
||||
{% if stat == 'last_updated' %}
|
||||
Last updated {{ stats[stat]|datetime_date }} UTC
|
||||
{% else %}
|
||||
{{ stats[stat] }}
|
||||
{% endif %}
|
||||
</li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="col s12 l9 m-4">
|
||||
<div class="row">
|
||||
<div class="col s6 offset-s3">
|
||||
<div class="col s6 offset-s3">
|
||||
<img class="responsive-img" src="{{ url_for('static', filename='img/bing_chilling.png') }}">
|
||||
</div>
|
||||
</div>
|
||||
<div class="col s12 center-align">
|
||||
<h5>"A big archive needs a search function." -Sun Tzu</h5>
|
||||
</div>
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<form method="post" class="">
|
||||
<div class="row">
|
||||
<div class="col s12 m-4 input-field">
|
||||
<input id="first_name" name="query" type="text" placeholder='Search the archive!' maxlength="64" value="{{ query }}">
|
||||
</div>
|
||||
<div class="divider"></div>
|
||||
<form method="post" class="">
|
||||
<div class="row">
|
||||
<div class="col s12 m-4 input-field">
|
||||
<input id="first_name" name="query" type="text" placeholder='Search the archive!' maxlength="64" value="{{ query }}">
|
||||
<label for="first_name">Searching in video titles, uploader names and tags.</label>
|
||||
<span class="supporting-text">Input will be interpreted as keywords. You can search for literal text by using quotes("). Or exclude by prepending minus (-).</span>
|
||||
</div>
|
||||
<div class="col s12 m-4">
|
||||
</div>
|
||||
<div class="col s12 m-4">
|
||||
<button class="btn icon-right waves-effect waves-light" type="submit" name="task" value="search">Search</button>
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
{% if results is defined %}
|
||||
<div class="divider"></div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
</form>
|
||||
{% if results is defined %}
|
||||
<div class="divider"></div>
|
||||
|
||||
<table class="striped highlight responsive-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th>Title</th>
|
||||
<th>Uploader</th>
|
||||
<th>Date</th>
|
||||
<th>Date</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
{% for result in results %}
|
||||
{% for result in results %}
|
||||
<tr>
|
||||
<td><a href="{{ url_for('watch.base') }}?v={{ result.get('id') }}">{{ result.get('title') }}</a></td>
|
||||
<td><a href="{{ url_for('channel.channel', channelId=result.get('channel_id')) }}">{{ result.get('uploader') }}</a></td>
|
||||
<td>{{ result.get('upload_date')|pretty_time }}</td>
|
||||
<td>{{ result.get('upload_date')|pretty_time }}</td>
|
||||
</tr>
|
||||
{% endfor %}
|
||||
{% endfor %}
|
||||
</tbody>
|
||||
</table>
|
||||
{% if results|length == 0 %}<h6>No results. Relax the search terms more please!</h6>{% else %}<p>Not the results you were looking for? Try adding quotes ("") around important words.</p>{% endif %}
|
||||
{% endif %}
|
||||
{% if results|length == 0 %}<h6>No results. Relax the search terms more please!</h6>{% else %}<p>Not the results you were looking for? Try adding quotes ("") around important words.</p>{% endif %}
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endblock %}
|
@@ -6,7 +6,7 @@
|
||||
<meta property="og:title" content="{{ render.get('info').get('title') }}" />
|
||||
<meta property="og:type" content="website" />
|
||||
<meta property="og:url" content="{{ url_for('watch.base') }}?v={{ render.get('info').get('id') }}" />
|
||||
<meta property="og:image" content="https://archive.ventilaar.net/videos/automatic/{{ render.get('info').get('channel_id') }}/{{ render.get('info').get('id') }}/{{ render.get('info').get('title') }}.jpg" />
|
||||
<meta property="og:image" content="https://archive.ventilaar.net/videos/automatic/{{ render.get('info').get('channel_id') }}/{{ render.get('info').get('id') }}/{{ render.get('info').get('_title_slug') }}.jpg" />
|
||||
<meta property="og:description" content="{{ render.get('info').get('description', '')|truncate(100) }}" />
|
||||
{% endblock %}
|
||||
|
||||
|
20
one_offs/add_failed_queue_to_unavailable.py
Normal file
20
one_offs/add_failed_queue_to_unavailable.py
Normal file
@@ -0,0 +1,20 @@
|
||||
from ayta.nosql import Mango
|
||||
#import ayta
|
||||
#app = ayta.create_app()
|
||||
mango = Mango('mongodb://root:example@192.168.66.140:27017')
|
||||
|
||||
data = mango.download_queue.find({'status': 'failed'})
|
||||
for x in data:
|
||||
vId = x['id']
|
||||
lines = x['fail_reason'].splitlines()
|
||||
error = lines[-1]
|
||||
check = "This video has been removed for violating YouTube's Terms of Service"
|
||||
|
||||
if check in error:
|
||||
print(vId)
|
||||
mango.info_json.insert_one({'id': vId, '_status': 'unavailable',
|
||||
'_status_description': f'Video is unavailable because YouTube said: {check}'})
|
||||
mango.queue_deleteQueue(vId)
|
||||
else:
|
||||
print(error)
|
||||
print('done')
|
18
one_offs/archive_size.py
Normal file
18
one_offs/archive_size.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from ayta.nosql import Mango
|
||||
#import ayta
|
||||
#app = ayta.create_app()
|
||||
mango = Mango('mongodb://root:example@192.168.66.140:27017')
|
||||
|
||||
data = mango.info_json.find({'_status': 'available'}, {'filesize_approx': 1})
|
||||
|
||||
total = 0
|
||||
|
||||
for x in data:
|
||||
size = x.get('filesize_approx')
|
||||
if size:
|
||||
total = total + int(size)
|
||||
|
||||
# the 5000 is the amount of GB of unjust approximation
|
||||
total = int(total / 1000000000 + 5000)
|
||||
|
||||
print(f'Approximate size: {total} GB')
|
37
one_offs/stats_downloads_per_day.py
Normal file
37
one_offs/stats_downloads_per_day.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from ayta.nosql import Mango
|
||||
import matplotlib.pyplot as plt
|
||||
from datetime import datetime, timedelta
|
||||
#import ayta
|
||||
#app = ayta.create_app()
|
||||
mango = Mango('mongodb://root:example@192.168.66.140:27017')
|
||||
|
||||
pivot = datetime.utcnow() - timedelta(days=90)
|
||||
pivot = int(pivot.timestamp())
|
||||
|
||||
data = mango.info_json.find({'_status': 'available', 'timestamp': {'$gt': pivot}}, {'epoch': 1})
|
||||
|
||||
stat = {}
|
||||
|
||||
for x in data:
|
||||
epoch = x['epoch']
|
||||
day = datetime.fromtimestamp(epoch).strftime('%Y%m%d')
|
||||
|
||||
if day not in stat:
|
||||
stat[day] = 1
|
||||
else:
|
||||
stat[day] = stat[day] + 1
|
||||
|
||||
dates = list(stat.keys())
|
||||
values = list(stat.values())
|
||||
|
||||
plt.figure(figsize=(16, 8)) # Set the figure size
|
||||
plt.bar(dates, values) # Create the bar chart
|
||||
|
||||
# Customize the x-axis labels so they are rotated and do not overlap
|
||||
plt.xticks(rotation=45, ha='right') # Rotate xticklabels by 45 degrees and align them to the right
|
||||
plt.xlabel('Date') # Label for x-axis
|
||||
plt.ylabel('Counts') # Label for y-axis
|
||||
plt.title('Bar Graph of Counts by Date') # Title of the graph
|
||||
|
||||
# Display the graph
|
||||
plt.show()
|
35
one_offs/stats_uploads_per_day.py
Normal file
35
one_offs/stats_uploads_per_day.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from ayta.nosql import Mango
|
||||
import matplotlib.pyplot as plt
|
||||
from datetime import datetime, timedelta
|
||||
#import ayta
|
||||
#app = ayta.create_app()
|
||||
mango = Mango('mongodb://root:example@192.168.66.140:27017')
|
||||
|
||||
pivot = '20220101'
|
||||
|
||||
data = mango.info_json.find({'_status': 'available', 'upload_date': {'$gt': pivot}}, {'upload_date': 1})
|
||||
|
||||
stat = {}
|
||||
|
||||
for x in data:
|
||||
day = x['upload_date']
|
||||
|
||||
if day not in stat:
|
||||
stat[day] = 1
|
||||
else:
|
||||
stat[day] = stat[day] + 1
|
||||
|
||||
dates = list(stat.keys())
|
||||
values = list(stat.values())
|
||||
|
||||
plt.figure(figsize=(16, 8)) # Set the figure size
|
||||
plt.bar(dates, values) # Create the bar chart
|
||||
|
||||
# Customize the x-axis labels so they are rotated and do not overlap
|
||||
plt.xticks(rotation=45, ha='right') # Rotate xticklabels by 45 degrees and align them to the right
|
||||
plt.xlabel('Date') # Label for x-axis
|
||||
plt.ylabel('Counts') # Label for y-axis
|
||||
plt.title('Bar Graph of Counts by Date') # Title of the graph
|
||||
|
||||
# Display the graph
|
||||
plt.show()
|
Reference in New Issue
Block a user