Compare commits

...

3 Commits

Author SHA1 Message Date
Ventilaar
570ac88b99 Reimplement orphaned video listing and more accurate stats
Some checks failed
Update worker server / build-and-publish (release) Has been cancelled
Generate docker image / build-and-publish (release) Successful in 19s
2025-02-11 22:55:21 +01:00
Ventilaar
c51a72ec2b minor typo
Some checks failed
Generate docker image / build-and-publish (release) Failing after 7s
Update worker server / build-and-publish (release) Successful in 16s
2025-02-11 17:41:20 +01:00
Ventilaar
fa8f11dad6 Major update
All checks were successful
Update worker server / build-and-publish (release) Successful in 13s
Generate docker image / build-and-publish (release) Successful in 59s
2025-02-11 13:12:10 +01:00
15 changed files with 253 additions and 72 deletions

View File

@@ -52,13 +52,22 @@ Extra functionality for further development of features.
### Stage 3
Mainly focused on retiring the cronjob based scripts and moving it to celery based tasks
- [ ] manage videos by ID's instead of per channel basis
- [ ] download videos from queue
- [x] manage videos by ID's instead of per channel basis
- [x] download videos from queue
- [x] Manage websub callbacks
- [x] Implement yt-dlp proxy servers, as the VPN is blocked
- [x] Celery tasks based video downloading
- [x] Manage websub callbacks
- [x] Celery task queue views
- [x] More performant statistics
- [ ] Retire cronjobs
- [ ] Retire file based configurations
### Stage 4
MongoDB finally has its limitations.
- [ ] Migrate to postgresql
- [ ] Retire time based tasks like channel mirroring
- [ ] A more comprehensive statistics page, uploads per day, downloads per day and such
### Stage ...
Since this is my flagship software, more features will be added as development continues.

View File

@@ -1,13 +0,0 @@
# Write a JSON lockfile recording the current local time and this process' PID.
import os
import datetime
import json


def print_current_time(give=False):
    """Print and return the current local time, truncated to whole seconds."""
    now = datetime.datetime.now().replace(microsecond=0)
    print(f'--- It is {now} ---')
    return now


with open('lockfile', 'w') as lock_file:
    payload = {'time': print_current_time(), 'PID': os.getpid()}
    # default=str serializes the datetime object as its ISO-like string form
    json.dump(payload, lock_file, default=str)

View File

@@ -24,9 +24,11 @@ def create_app(test_config=None):
# Celery Periodic tasks
config['CELERY']['beat_schedule'] = {}
config['CELERY']['beat_schedule']['Renew WebSub endpoints'] = {'task': 'ayta.tasks.websub_renew_expiring', 'schedule': 4000}
config['CELERY']['beat_schedule']['Process WebSub data'] = {'task': 'ayta.tasks.websub_process_data', 'schedule': 100}
config['CELERY']['beat_schedule']['Queue up new videos in static channel playlists'] = {'task': 'ayta.tasks.playlist_to_queue', 'schedule': 50000}
config['CELERY']['beat_schedule']['Renew WebSub endpoints around every hour'] = {'task': 'ayta.tasks.websub_renew_expiring', 'schedule': 4000}
config['CELERY']['beat_schedule']['Process WebSub data around every two minutes'] = {'task': 'ayta.tasks.websub_process_data', 'schedule': 100}
config['CELERY']['beat_schedule']['Queue up new videos in static channel playlists about 2 times a day'] = {'task': 'ayta.tasks.playlist_to_queue', 'schedule': 50000}
config['CELERY']['beat_schedule']['Download around 123 videos spread out through the day'] = {'task': 'ayta.tasks.video_queue', 'schedule': 700}
config['CELERY']['beat_schedule']['Generate new statistiscs about every 3 hours'] = {'task': 'ayta.tasks.generate_statistics', 'schedule': 10000}
# Celery task routing
# Tasks not defined in this configuration will be routed to the default queue "celery"
@@ -51,6 +53,7 @@ def create_app(test_config=None):
app.jinja_env.filters['current_time'] = filters.current_time
app.jinja_env.filters['epoch_time'] = filters.epoch_time
app.jinja_env.filters['epoch_date'] = filters.epoch_date
app.jinja_env.filters['datetime_date'] = filters.datetime_date
from .blueprints import watch
from .blueprints import index

View File

@@ -230,8 +230,9 @@ def queue():
endpoints = get_nosql().queue_getEndpoints()
queue = get_nosql().queue_getQueue()
count = len(list(queue.clone()))
return render_template('admin/queue.html', endpoints=endpoints, queue=queue)
return render_template('admin/queue.html', endpoints=endpoints, queue=queue, count=count)
@bp.route('/users', methods=['GET', 'POST'])
@login_required

View File

@@ -22,5 +22,4 @@ def base():
return render_template('search/index.html', results=results, query=query)
return render_template('search/index.html', stats=get_nosql().gen_stats())
return render_template('search/index.html', stats=get_nosql().statistics_get())

View File

@@ -27,6 +27,12 @@ def epoch_time(epoch):
return datetime.fromtimestamp(epoch).strftime('%d %b %Y %H:%M:%S')
except:
return None
def datetime_date(obj):
    """Jinja filter: format a datetime-like object as e.g. '11 Feb 2025 13:12'.

    Returns None when *obj* cannot be formatted (None, a string, a bad
    value), mirroring the None-on-failure convention of the sibling
    epoch_time/epoch_date filters.
    """
    try:
        return obj.strftime('%d %b %Y %H:%M')
    # BUGFIX: was a bare `except:`, which also swallows SystemExit and
    # KeyboardInterrupt; catch only the failures a bad input can raise.
    except (AttributeError, TypeError, ValueError):
        return None
def current_time(null=None, object=False):
if object:

View File

@@ -44,6 +44,7 @@ class Mango:
self.reports = self.db['reports']
self.queue_endpoints = self.db['queue_endpoints']
self.users = self.db['users']
self.statistics = self.db['statistics']
self.ensure_indexes()
@@ -55,7 +56,11 @@ class Mango:
'info_json': [
('id_1', True),
('channel_id_1', False),
('uploader_1', False)
('uploader_1', False),
('timestamp', False),
('upload_date', False),
('filesize_approx', False),
('_status', False)
],
'websub_callbacks': [
('id', True)
@@ -92,16 +97,6 @@ class Mango:
# general functions #
##########################################
def gen_stats(self):
    """Return archive-wide document counts keyed by 'videos', 'channels', 'queue'."""
    return {
        'videos': self.info_json.count_documents({}),
        'channels': self.channels.count_documents({}),
        'queue': self.download_queue.count_documents({}),
    }
def search_videos(self, query):
# search the index for the requested query. return limited keys
results = self.info_json.find({"$text": {"$search": query}},
@@ -198,7 +193,9 @@ class Mango:
def get_orphaned_videos(self):
""" Returns a SET of YouTube video ID's which have info_jsons in the collection but no permanent channel is defined. SLOW OPERATION """
# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
# The following code I have commented out because the query took too long to proccess, timing the operation out
"""# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
pipeline = [{'$match': {'_status': 'available'}},
{'$lookup': {'from': 'channels', 'localField': 'channel_id', 'foreignField': 'id', 'as': 'channel'}},
{'$match': {'channel': {'$size': 0}}},{'$project': {'id': 1}},
@@ -207,7 +204,20 @@ class Mango:
results = self.info_json.aggregate(pipeline)
ids = [result['id'] for result in results]
return tuple(ids)
return tuple(ids)"""
# Reimplementing the query but in python, as I do not care about memory usage or data transfer
channels = self.channels.find({}, {'_id': 0, 'id': 1})
videos = self.info_json.find({'_status': 'available'}, {'_id': 0, 'channel_id': 1, 'id': 1})
channels = set([x['id'] for x in channels])
orphaned = []
for item in videos:
if item['channel_id'] not in channels:
orphaned.append(item['id'])
return tuple(orphaned)
def get_recent_videos(self, count=99):
""" Returns a SET of YouTube video ID's which have been added last to the info_json collection """
@@ -455,6 +465,44 @@ class Mango:
continue
self.download_queue.update_one({'id': queueItem['id']}, {'$set': {'status': 'working'}})
return queueItem
##########################################
# STATISTICS FUNCTIONS #
##########################################
def statistics_counts(self):
    """Return human-readable count lines for videos, channels and the download queue."""
    available_videos = self.info_json.count_documents({'_status': 'available'})
    channel_total = self.channels.count_documents({})
    queued_downloads = self.download_queue.count_documents({'status': 'queued'})
    return {
        'videos': f"{available_videos} videos in the archive",
        'channels': f'{channel_total} channels in the system',
        'download_queue': f"{queued_downloads} queued videos for download",
    }
def statistics_sizes(self):
    """Return a human-readable approximation of total archive storage in TB."""
    documents = self.info_json.find({'_status': 'available'}, {'filesize_approx': 1})
    total_bytes = sum(
        int(doc['filesize_approx'])
        for doc in documents
        if doc.get('filesize_approx')
    )
    # the 5 is the amount of TB of unjust approximation (updated feb 2025)
    terabytes = int(total_bytes / 1000000000000 + 5)
    return {'storage': f'{terabytes} TB of storage'}
def statistics_generate(self):
    """Recompute archive statistics and persist them as the single
    document in the statistics collection (old snapshots are dropped).
    """
    from datetime import datetime  # local import keeps the fix self-contained

    data = self.statistics_sizes() | self.statistics_counts()
    # BUGFIX: was `self.datetime.utcnow()` — no `datetime` attribute is
    # assigned on this class in the visible initializer, so this raised
    # AttributeError at runtime; use the stdlib datetime module instead.
    data['last_updated'] = datetime.utcnow()
    self.statistics.delete_many({})  # drop existing documents; keep one fresh snapshot
    self.statistics.insert_one(data)
def statistics_get(self):
    """Fetch the cached statistics document (without the Mongo `_id`), or None if absent."""
    projection = {'_id': 0}
    return self.statistics.find_one({}, projection)
##########################################
# HELPER FUNCTIONS #

View File

@@ -168,6 +168,12 @@ def websub_renew_expiring(hours=6):
@shared_task()
def playlist_to_queue():
"""
As there is still one cronjob based task running daily in the background, we have to make sure that gets hooked as well into the system.
The cronjob task gets the last 50 uploads for all channels and commits the playlist json into the database
This task makes sure we append the ID's that we got from the playlist into the download queue.
Should idealy be run after the cronjob completes, but I don't want to implement an API that does that, so this gets run twice a day.
"""
from .nosql import get_nosql
import random
from datetime import datetime, timedelta
@@ -187,6 +193,11 @@ def playlist_to_queue():
for item in info['playlist']['entries']:
videoId = item['id']
get_nosql().queue_insertQueue(videoId, 'Playlist mirroring')
@shared_task()
def generate_statistics():
    """Celery task: rebuild the cached archive statistics document."""
    from .nosql import get_nosql

    mango = get_nosql()
    mango.statistics_generate()
##########################################
# TASK MODULES #

View File

@@ -126,10 +126,13 @@
</div>
<div class="divider"></div>
<div class="row">
<div class="col s6 l9">
<div class="col s4 l8">
<h5>Queued ID's</h5>
</div>
<div class="col s6 l3 m-4 input-field">
<div class="col s4 l1">
<p>{{ count }} items</p>
</div>
<div class="col s4 l3 m-4 input-field">
<input id="filter_query" type="text">
<label for="filter_query">Filter results</label>
</div>

View File

@@ -5,68 +5,72 @@
{% block content %}
<div class="row">
<div class="col s12 l3 m-4">
<h4>Search the archive</h4>
<p>Searching is currently partially working and will probably not work optimally for a long time until the database and backend is fully reworked.</p>
<p>In the meantime if you know the channel name and video title you can use local search on <a href="{{ url_for('channel.base') }}">this</a> page</p>
<img class="responsive-img" src="{{ url_for('static', filename='img/mongo_meme.png') }}">
{% if stats is defined %}
<div class="divider"></div>
<h5>Stats of the archive</h5>
<h4>Search the archive</h4>
<p>Searching is currently partially working and will probably not work optimally for a long time until the database and backend is fully reworked.</p>
<p>In the meantime if you know the channel name and video title you can use local search on <a href="{{ url_for('channel.base') }}">this</a> page</p>
<img class="responsive-img" src="{{ url_for('static', filename='img/mongo_meme.png') }}">
{% if stats is not none and stats is defined %}
<div class="divider"></div>
<h5>Stats of the archive</h5>
<ul class="collection">
{% for stat in stats %}
{% for stat in stats %}
<li class="collection-item">
<span class="title">{{ stat }}</span>
<p>{{ stats[stat] }}</p>
</li>
{% endfor %}
<!--<span class="title">{{ stat }}</span>-->
{% if stat == 'last_updated' %}
Last updated {{ stats[stat]|datetime_date }} UTC
{% else %}
{{ stats[stat] }}
{% endif %}
</li>
{% endfor %}
</ul>
{% endif %}
{% endif %}
</div>
<div class="col s12 l9 m-4">
<div class="row">
<div class="col s6 offset-s3">
<div class="col s6 offset-s3">
<img class="responsive-img" src="{{ url_for('static', filename='img/bing_chilling.png') }}">
</div>
</div>
<div class="col s12 center-align">
<h5>"A big archive needs a search function." -Sun Tzu</h5>
</div>
</div>
<div class="divider"></div>
<form method="post" class="">
<div class="row">
<div class="col s12 m-4 input-field">
<input id="first_name" name="query" type="text" placeholder='Search the archive!' maxlength="64" value="{{ query }}">
</div>
<div class="divider"></div>
<form method="post" class="">
<div class="row">
<div class="col s12 m-4 input-field">
<input id="first_name" name="query" type="text" placeholder='Search the archive!' maxlength="64" value="{{ query }}">
<label for="first_name">Searching in video titles, uploader names and tags.</label>
<span class="supporting-text">Input will be interpreted as keywords. You can search for literal text by using quotes("). Or exclude by prepending minus (-).</span>
</div>
<div class="col s12 m-4">
</div>
<div class="col s12 m-4">
<button class="btn icon-right waves-effect waves-light" type="submit" name="task" value="search">Search</button>
</div>
</div>
</form>
{% if results is defined %}
<div class="divider"></div>
</div>
</div>
</form>
{% if results is defined %}
<div class="divider"></div>
<table class="striped highlight responsive-table">
<thead>
<tr>
<th>Title</th>
<th>Uploader</th>
<th>Date</th>
<th>Date</th>
</tr>
</thead>
<tbody>
{% for result in results %}
{% for result in results %}
<tr>
<td><a href="{{ url_for('watch.base') }}?v={{ result.get('id') }}">{{ result.get('title') }}</a></td>
<td><a href="{{ url_for('channel.channel', channelId=result.get('channel_id')) }}">{{ result.get('uploader') }}</a></td>
<td>{{ result.get('upload_date')|pretty_time }}</td>
<td>{{ result.get('upload_date')|pretty_time }}</td>
</tr>
{% endfor %}
{% endfor %}
</tbody>
</table>
{% if results|length == 0 %}<h6>No results. Relax the search terms more please!</h6>{% else %}<p>Not the results you were looking for? Try adding quotes ("") around important words.</p>{% endif %}
{% endif %}
{% if results|length == 0 %}<h6>No results. Relax the search terms more please!</h6>{% else %}<p>Not the results you were looking for? Try adding quotes ("") around important words.</p>{% endif %}
{% endif %}
</div>
</div>
{% endblock %}

View File

@@ -6,7 +6,7 @@
<meta property="og:title" content="{{ render.get('info').get('title') }}" />
<meta property="og:type" content="website" />
<meta property="og:url" content="{{ url_for('watch.base') }}?v={{ render.get('info').get('id') }}" />
<meta property="og:image" content="https://archive.ventilaar.net/videos/automatic/{{ render.get('info').get('channel_id') }}/{{ render.get('info').get('id') }}/{{ render.get('info').get('title') }}.jpg" />
<meta property="og:image" content="https://archive.ventilaar.net/videos/automatic/{{ render.get('info').get('channel_id') }}/{{ render.get('info').get('id') }}/{{ render.get('info').get('_title_slug') }}.jpg" />
<meta property="og:description" content="{{ render.get('info').get('description', '')|truncate(100) }}" />
{% endblock %}

View File

@@ -0,0 +1,20 @@
from ayta.nosql import Mango

# One-off maintenance script: walk the failed download queue and mark videos
# that YouTube removed for Terms of Service violations as permanently
# unavailable, then drop them from the queue.
mango = Mango('mongodb://root:example@192.168.66.140:27017')

# Loop-invariant marker text to look for in the last line of the failure log.
TOS_MESSAGE = "This video has been removed for violating YouTube's Terms of Service"

for entry in mango.download_queue.find({'status': 'failed'}):
    video_id = entry['id']
    last_error_line = entry['fail_reason'].splitlines()[-1]
    if TOS_MESSAGE in last_error_line:
        print(video_id)
        mango.info_json.insert_one({
            'id': video_id,
            '_status': 'unavailable',
            '_status_description': f'Video is unavailable because YouTube said: {TOS_MESSAGE}',
        })
        mango.queue_deleteQueue(video_id)
    else:
        print(last_error_line)

print('done')

18
one_offs/archive_size.py Normal file
View File

@@ -0,0 +1,18 @@
from ayta.nosql import Mango
#import ayta
#app = ayta.create_app()
mango = Mango('mongodb://root:example@192.168.66.140:27017')
data = mango.info_json.find({'_status': 'available'}, {'filesize_approx': 1})
total = 0
for x in data:
size = x.get('filesize_approx')
if size:
total = total + int(size)
# the 5000 is the amount of GB of unjust approximation
total = int(total / 1000000000 + 5000)
print(f'Approximate size: {total} GB')

View File

@@ -0,0 +1,37 @@
from ayta.nosql import Mango
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
#import ayta
#app = ayta.create_app()
mango = Mango('mongodb://root:example@192.168.66.140:27017')
pivot = datetime.utcnow() - timedelta(days=90)
pivot = int(pivot.timestamp())
data = mango.info_json.find({'_status': 'available', 'timestamp': {'$gt': pivot}}, {'epoch': 1})
stat = {}
for x in data:
epoch = x['epoch']
day = datetime.fromtimestamp(epoch).strftime('%Y%m%d')
if day not in stat:
stat[day] = 1
else:
stat[day] = stat[day] + 1
dates = list(stat.keys())
values = list(stat.values())
plt.figure(figsize=(16, 8)) # Set the figure size
plt.bar(dates, values) # Create the bar chart
# Customize the x-axis labels to be vertical
plt.xticks(rotation=45, ha='right') # Rotate xticklabels by 45 degrees and align them to the right
plt.xlabel('Date') # Label for x-axis
plt.ylabel('Counts') # Label for y-axis
plt.title('Bar Graph of Counts by Date') # Title of the graph
# Display the graph
plt.show()

View File

@@ -0,0 +1,35 @@
from ayta.nosql import Mango
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
#import ayta
#app = ayta.create_app()
mango = Mango('mongodb://root:example@192.168.66.140:27017')
pivot = '20220101'
data = mango.info_json.find({'_status': 'available', 'upload_date': {'$gt': pivot}}, {'upload_date': 1})
stat = {}
for x in data:
day = x['upload_date']
if day not in stat:
stat[day] = 1
else:
stat[day] = stat[day] + 1
dates = list(stat.keys())
values = list(stat.values())
plt.figure(figsize=(16, 8)) # Set the figure size
plt.bar(dates, values) # Create the bar chart
# Customize the x-axis labels to be vertical
plt.xticks(rotation=45, ha='right') # Rotate xticklabels by 45 degrees and align them to the right
plt.xlabel('Date') # Label for x-axis
plt.ylabel('Counts') # Label for y-axis
plt.title('Bar Graph of Counts by Date') # Title of the graph
# Display the graph
plt.show()