Reimplement orphaned video listing and more accurate stats

2025-02-11 22:55:21 +01:00
1 changed files with 19 additions and 4 deletions
--- a/ayta/nosql.py
+++ b/ayta/nosql.py
@@ -193,7 +193,9 @@ class Mango:
        
    def get_orphaned_videos(self):
        """ Returns a SET of YouTube video ID's which have info_jsons in the collection but no permanent channel is defined. SLOW OPERATION """
-        # Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
+        
+        # The following code is commented out because the query took too long to process, causing the operation to time out
+        """# Ok lemme explain. Perform inner join from channel collection on channel_id key. match only the fields which are empty. return video id
        pipeline = [{'$match': {'_status': 'available'}},
                    {'$lookup': {'from': 'channels', 'localField': 'channel_id', 'foreignField': 'id', 'as': 'channel'}},
                    {'$match': {'channel': {'$size': 0}}},{'$project': {'id': 1}},
@@ -202,7 +204,20 @@ class Mango:
        results = self.info_json.aggregate(pipeline)
        ids = [result['id'] for result in results]

-        return tuple(ids)
+        return tuple(ids)"""
+        
+        # Reimplement the query in Python instead; memory usage and data transfer are not a concern here
+        channels = self.channels.find({}, {'_id': 0, 'id': 1})
+        videos = self.info_json.find({'_status': 'available'}, {'_id': 0, 'channel_id': 1, 'id': 1})
+        
+        channels = set([x['id'] for x in channels])
+        orphaned = []
+        
+        for item in videos:
+            if item['channel_id'] not in channels:
+                orphaned.append(item['id'])
+        
+        return tuple(orphaned)
    
    def get_recent_videos(self, count=99):
        """ Returns a SET of YouTube video ID's which have been added last to the info_json collection """
@@ -458,9 +473,9 @@ class Mango:
    def statistics_counts(self):
        counts = {}
        
-        counts['videos'] = f'{self.info_json.count_documents({})} videos in the archive'
+        counts['videos'] = f"{self.info_json.count_documents({'_status': 'available'})} videos in the archive"
        counts['channels'] = f'{self.channels.count_documents({})} channels in the system'
-        counts['download_queue'] = f'{self.download_queue.count_documents({})} queued videos for download'
+        counts['download_queue'] = f"{self.download_queue.count_documents({'status': 'queued'})} queued videos for download"
        
        return counts