"""
Janitor Module

Handles background cleanup of expired tasks and their output directories.
"""

import contextlib
import shutil
import sqlite3
import threading
import time
from pathlib import Path

from flask import current_app


def start_janitor(app, interval_seconds=30):
    """Start the background janitor thread for cleaning up expired tasks.

    Args:
        app: Flask application. Its app context is pushed inside the worker
            thread and its logger is used for reporting.
        interval_seconds: Pause between two cleanup cycles, in seconds.
            Defaults to 30.
    """

    def worker():
        with app.app_context():
            while True:
                try:
                    cleanup_expired_tasks()
                except Exception as e:
                    # Top-level boundary of the thread: log and keep looping so
                    # one bad cycle never kills the janitor.
                    app.logger.error(f"Janitor cleanup error: {str(e)}", exc_info=True)
                finally:
                    # Sleep before the next cleanup cycle, even after a failure.
                    time.sleep(interval_seconds)

    # Daemon thread so it doesn't prevent app shutdown.
    thread = threading.Thread(target=worker, daemon=True, name="janitor")
    thread.start()
    app.logger.info("Janitor thread started for background cleanup")


def _is_dry_run():
    """Return True when the JANITOR_DRY_RUN config value enables dry-run mode."""
    value = current_app.config.get('JANITOR_DRY_RUN', 'false')
    if isinstance(value, str):
        return value.lower() == 'true'
    return bool(value)


def _configured_dir(key):
    """Return the directory configured under *key* as a Path, or None if unset.

    ``Path('')`` is ``Path('.')`` and Path objects are always truthy, so an
    unset folder must be represented as None to be detectable by callers.
    """
    raw = current_app.config.get(key)
    return Path(raw) if raw else None


def _collect_delete_targets(task_id, output_dir, output_base, upload_base):
    """Build the de-duplicated list of paths that belong to one task."""
    targets = []

    # The output_dir recorded in the database.
    if output_dir:
        targets.append(Path(output_dir))

    # An empty task_id would make "base / task_id" equal to the base folder
    # itself, so derived paths are only built for a non-empty id.
    name = str(task_id) if task_id is not None else ''
    if name:
        # Fallback by convention: outputs/<task_id>, then uploads/<task_id>.
        for base in (output_base, upload_base):
            if base is None:
                continue
            derived = base / name
            if derived not in targets:
                targets.append(derived)

    return targets


def _remove_path(task_id, target):
    """Best-effort removal of one directory tree, logging the real outcome."""
    try:
        if not target.exists():
            current_app.logger.warning(f"Path not found for task {task_id}: {target}")
            return
        shutil.rmtree(target, ignore_errors=True)
        # rmtree swallows errors here, so verify instead of assuming success.
        if target.exists():
            current_app.logger.error(
                f"Failed to delete path {target} for task {task_id}: path still exists"
            )
        else:
            current_app.logger.info(f"Deleted path for task {task_id}: {target}")
    except OSError as e:
        current_app.logger.error(f"Failed to delete path {target} for task {task_id}: {e}")


def cleanup_expired_tasks():
    """Clean up tasks that have exceeded their deletion time.

    Selects every task whose ``delete_after_at`` is at or before the current
    time (shifted by +8 hours, matching how the timestamp is stored), removes
    its output/upload directories and hard-deletes its row.

    When the ``JANITOR_DRY_RUN`` config value is truthy (or the string
    ``'true'``), the paths that would be removed are only logged; neither the
    filesystem nor the database is modified.

    Database errors are logged and swallowed so the caller's loop continues.
    Must run inside a Flask application context.
    """
    db_path = _get_db_path()
    dry_run = _is_dry_run()

    try:
        with contextlib.closing(
            sqlite3.connect(str(db_path), check_same_thread=False)
        ) as conn:
            conn.row_factory = sqlite3.Row
            conn.execute("PRAGMA foreign_keys=ON")

            # Log current time for debugging (using same timezone as setting).
            current_time = conn.execute("SELECT datetime('now', '+8 hours')").fetchone()[0]
            current_app.logger.info(f"Janitor cleanup check at: {current_time}")

            # Debug: list tasks that have a deletion time scheduled.
            debug_rows = conn.execute("""
                SELECT task_id, delete_after_at, downloaded_at
                FROM tasks
                WHERE delete_after_at IS NOT NULL
                  AND deleted_at IS NULL
            """).fetchall()
            if debug_rows:
                current_app.logger.info(f"Found {len(debug_rows)} tasks with delete_after_at set:")
                for row in debug_rows:
                    current_app.logger.info(f"  Task {row['task_id']}: delete_at={row['delete_after_at']}, downloaded_at={row['downloaded_at']}")

            # Find tasks that need to be deleted.
            rows = conn.execute("""
                SELECT task_id, output_dir
                FROM tasks
                WHERE delete_after_at IS NOT NULL
                  AND deleted_at IS NULL
                  AND delete_after_at <= datetime('now', '+8 hours')
            """).fetchall()
            if not rows:
                return  # No tasks to clean up.

            current_app.logger.info(f"Janitor found {len(rows)} expired tasks to clean up")

            upload_base = _configured_dir('UPLOAD_FOLDER')
            output_base = _configured_dir('OUTPUT_FOLDER')

            for row in rows:
                task_id = row['task_id']
                try:
                    delete_targets = _collect_delete_targets(
                        task_id, row['output_dir'], output_base, upload_base
                    )

                    if dry_run:
                        for tgt in delete_targets:
                            current_app.logger.info(f"[DRY RUN] Would delete task {task_id} path: {tgt}")
                        continue

                    # Actual deletion of the files on disk.
                    for tgt in delete_targets:
                        _remove_path(task_id, tgt)

                    # Hard delete from database.
                    conn.execute("DELETE FROM tasks WHERE task_id = ?", (task_id,))
                    conn.commit()
                    current_app.logger.info(f"Hard deleted task {task_id} from database")
                except Exception as e:
                    current_app.logger.error(f"Failed to delete task {task_id}: {str(e)}", exc_info=True)
                    # Drop any half-applied statement so it cannot be committed
                    # together with the next task, then continue with the rest.
                    with contextlib.suppress(sqlite3.Error):
                        conn.rollback()
    except Exception as e:
        current_app.logger.error(f"Database error during cleanup: {str(e)}", exc_info=True)


def reconcile_tasks_on_startup():
    """Reconcile task states on application startup.

    Performs three passes in a single transaction:

    1. Sets ``delete_after_at`` (download time + 10 minutes) on downloaded
       tasks that lack it.
    2. Hard-deletes tasks whose recorded ``output_dir`` no longer exists.
    3. Backfills an empty ``output_dir`` with ``OUTPUT_FOLDER/<task_id>`` when
       ``OUTPUT_FOLDER`` is configured.

    Errors are logged and swallowed. Must run inside a Flask application
    context.
    """
    db_path = _get_db_path()

    try:
        with contextlib.closing(
            sqlite3.connect(str(db_path), check_same_thread=False)
        ) as conn:
            conn.execute("PRAGMA foreign_keys=ON")

            # Fix tasks that were downloaded but don't have delete_after_at set.
            # This handles cases where the app crashed after marking downloaded
            # but before setting the delete time.
            cursor = conn.execute("""
                UPDATE tasks
                SET delete_after_at = COALESCE(delete_after_at, datetime(downloaded_at, '+10 minutes'))
                WHERE downloaded_at IS NOT NULL
                  AND deleted_at IS NULL
                  AND delete_after_at IS NULL
            """)
            # rowcount is scoped to this statement, unlike conn.total_changes.
            updated_count = cursor.rowcount
            if updated_count > 0:
                current_app.logger.info(f"Startup reconciliation: Fixed delete_after_at for {updated_count} downloaded tasks")

            # Check for tasks with output directories that no longer exist.
            # This cleans up database entries for manually deleted directories.
            rows = conn.execute("""
                SELECT task_id, output_dir
                FROM tasks
                WHERE output_dir IS NOT NULL
                  AND deleted_at IS NULL
            """).fetchall()
            # Column 0 is task_id, column 1 is output_dir. Empty strings are
            # skipped here; they are handled by the backfill pass below.
            orphaned_ids = [
                (row[0],)
                for row in rows
                if row[1] and not Path(row[1]).exists()
            ]
            if orphaned_ids:
                conn.executemany("DELETE FROM tasks WHERE task_id = ?", orphaned_ids)
                current_app.logger.info(f"Startup reconciliation: Removed {len(orphaned_ids)} tasks from database (directories not found)")

            # Backfill output_dir for rows where it is NULL or empty, using
            # OUTPUT_FOLDER/<task_id>. Skipped when OUTPUT_FOLDER is unset so no
            # relative path is ever written to the database.
            output_base = _configured_dir('OUTPUT_FOLDER')
            if output_base is not None:
                rows2 = conn.execute("""
                    SELECT task_id
                    FROM tasks
                    WHERE (output_dir IS NULL OR output_dir = '')
                      AND deleted_at IS NULL
                """).fetchall()
                for r in rows2:
                    task_id_from_db = r[0]  # task_id is the first column
                    guess = output_base / str(task_id_from_db)
                    conn.execute(
                        "UPDATE tasks SET output_dir = ? WHERE task_id = ?",
                        (str(guess), task_id_from_db),
                    )
                    current_app.logger.info(f"Backfilled output_dir for task {task_id_from_db}: {guess}")

            conn.commit()
    except Exception as e:
        current_app.logger.error(f"Startup reconciliation error: {str(e)}", exc_info=True)


def _get_db_path():
    """Get database file path from app config.

    Delegates to ``get_db_path`` from the sibling ``db`` module, passing the
    current Flask app. The import is done lazily inside the function.
    """
    from .db import get_db_path as get_config_db_path

    return get_config_db_path(current_app)