feat: Background workers
- analyzer.py: Issue analysis pipeline (JIRA -> LLM -> PR)
- indexer.py: Code indexing pipeline (Bitbucket -> Embeddings -> Qdrant)
- Redis queue-based processing
- Progress tracking and status updates
parent 011a93c5b9
commit 27b72e3ccd
@@ -0,0 +1 @@
"""Workers package."""
@@ -0,0 +1,208 @@
"""
Issue Analyzer Worker - Background processing of JIRA issues.
"""
import asyncio
import json
import logging
import os
from datetime import datetime
from typing import Optional

import redis.asyncio as redis

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class IssueAnalyzer:
    """
    Background worker that processes JIRA issues from the queue.

    Flow:
    1. Poll Redis queue for new issues
    2. Fetch issue details from JIRA
    3. Search for relevant code in vector DB
    4. Send to LLM for analysis
    5. Generate fix proposal
    6. Create PR if confidence is high enough
    7. Post analysis back to JIRA
    """

    def __init__(
        self,
        redis_url: str = "redis://localhost:6379",
        queue_name: str = "issues:pending",
    ):
        self.redis_url = redis_url
        self.queue_name = queue_name
        self.redis: Optional[redis.Redis] = None
        self.running = False

    async def connect(self):
        """Connect to Redis."""
        self.redis = redis.from_url(self.redis_url)
        logger.info(f"🔌 Connected to Redis: {self.redis_url}")

    async def disconnect(self):
        """Disconnect from Redis."""
        if self.redis:
            await self.redis.close()

    async def run(self):
        """Main worker loop."""
        self.running = True
        await self.connect()

        logger.info("🚀 Issue Analyzer worker started")

        while self.running:
            try:
                # Block waiting for new items (timeout 5s)
                result = await self.redis.blpop(self.queue_name, timeout=5)

                if result:
                    _, data = result
                    issue_data = json.loads(data)
                    await self.process_issue(issue_data)
            except Exception as e:
                logger.error(f"❌ Error in worker loop: {e}")
                await asyncio.sleep(5)  # Back off on error

        await self.disconnect()
        logger.info("👋 Issue Analyzer worker stopped")

    async def process_issue(self, issue_data: dict):
        """
        Process a single issue.

        Args:
            issue_data: Dict with issue key and metadata
        """
        issue_key = issue_data.get("key")
        logger.info(f"📋 Processing issue: {issue_key}")

        try:
            # Update status
            await self.update_status(issue_key, "analyzing")

            # 1. Fetch full issue details from JIRA
            issue_details = await self.fetch_issue_details(issue_key)

            # 2. Extract relevant context
            description = issue_details.get("fields", {}).get("description", "")
            summary = issue_details.get("fields", {}).get("summary", "")

            # 3. Search for relevant code
            code_context = await self.search_relevant_code(summary, description)

            # 4. Get business rules for the affected module
            module = await self.identify_module(summary, description)
            business_rules = await self.get_module_rules(module)

            # 5. Send to LLM for analysis
            analysis = await self.analyze_with_llm(
                issue_description=f"{summary}\n\n{description}",
                code_context=code_context,
                business_rules=business_rules,
            )

            # 6. Store analysis result
            await self.store_analysis(issue_key, analysis)

            # 7. If confidence is high, create PR
            if analysis.get("confidence", 0) >= 0.75:
                pr_url = await self.create_pull_request(issue_key, analysis)
                analysis["pr_url"] = pr_url

            # 8. Post comment to JIRA
            await self.post_jira_comment(issue_key, analysis)

            # Update status
            status = "pr_created" if analysis.get("pr_url") else "analyzed"
            await self.update_status(issue_key, status)

            logger.info(f"✅ Completed analysis: {issue_key} (confidence: {analysis.get('confidence', 0):.0%})")

        except Exception as e:
            logger.error(f"❌ Failed to process {issue_key}: {e}")
            await self.update_status(issue_key, "failed", str(e))

    async def fetch_issue_details(self, issue_key: str) -> dict:
        """Fetch issue details from JIRA."""
        # TODO: Implement JIRA client call
        logger.info(f"🔍 Fetching issue details: {issue_key}")
        return {}
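
    # --- Illustrative sketch, not part of this commit -----------------------
    # One way the TODO above could be filled in with httpx (already in the
    # worker requirements) against the JIRA REST API v2. JIRA_BASE_URL,
    # JIRA_EMAIL and JIRA_API_TOKEN are assumed environment variables, and the
    # helper name is hypothetical.
    async def _fetch_issue_details_sketch(self, issue_key: str) -> dict:
        import httpx

        base_url = os.environ["JIRA_BASE_URL"].rstrip("/")
        auth = (os.environ["JIRA_EMAIL"], os.environ["JIRA_API_TOKEN"])
        async with httpx.AsyncClient(auth=auth, timeout=30) as client:
            resp = await client.get(f"{base_url}/rest/api/2/issue/{issue_key}")
            resp.raise_for_status()
            return resp.json()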

    async def search_relevant_code(self, summary: str, description: str) -> str:
        """Search vector DB for relevant code."""
        # TODO: Implement vector search
        logger.info("🔍 Searching for relevant code...")
        return ""
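
    # --- Illustrative sketch, not part of this commit -----------------------
    # search_relevant_code could be backed by sentence-transformers and
    # qdrant-client (both in the requirements). The collection name
    # "code_chunks", the embedding model and the helper name are assumptions;
    # a real implementation would create the model and client once.
    async def _search_relevant_code_sketch(self, summary: str, description: str) -> str:
        from qdrant_client import QdrantClient
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer("all-MiniLM-L6-v2")
        query_vector = model.encode(f"{summary}\n{description}").tolist()

        client = QdrantClient(url=os.getenv("QDRANT_URL", "http://localhost:6333"))
        hits = client.search(collection_name="code_chunks", query_vector=query_vector, limit=5)
        return "\n\n".join((hit.payload or {}).get("content", "") for hit in hits)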

    async def identify_module(self, summary: str, description: str) -> Optional[str]:
        """Identify which business module the issue relates to."""
        # TODO: Implement module identification
        return None

    async def get_module_rules(self, module: Optional[str]) -> str:
        """Get business rules for a module."""
        # TODO: Load from database
        return ""

    async def analyze_with_llm(
        self,
        issue_description: str,
        code_context: str,
        business_rules: str,
    ) -> dict:
        """Send to LLM for analysis."""
        # TODO: Implement LLM service call
        logger.info("🤖 Analyzing with LLM...")
        return {
            "root_cause": "Analysis pending",
            "affected_files": [],
            "proposed_fix": "",
            "confidence": 0.0,
            "explanation": "",
        }
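
    # --- Illustrative sketch, not part of this commit -----------------------
    # The LLM call is left open above; one possible shape, assuming an
    # OpenAI-compatible chat-completions endpoint at LLM_API_URL with key
    # LLM_API_KEY (both hypothetical) and a model that replies with JSON
    # matching the stub's keys.
    async def _analyze_with_llm_sketch(self, issue_description: str, code_context: str, business_rules: str) -> dict:
        import httpx

        prompt = (
            "Analyze this issue and propose a fix.\n\n"
            f"Issue:\n{issue_description}\n\n"
            f"Relevant code:\n{code_context}\n\n"
            f"Business rules:\n{business_rules}\n\n"
            "Answer as JSON with keys: root_cause, affected_files, proposed_fix, confidence, explanation."
        )
        async with httpx.AsyncClient(timeout=120) as client:
            resp = await client.post(
                f"{os.environ['LLM_API_URL']}/v1/chat/completions",
                headers={"Authorization": f"Bearer {os.environ['LLM_API_KEY']}"},
                json={
                    "model": os.getenv("LLM_MODEL", "gpt-4o-mini"),
                    "messages": [{"role": "user", "content": prompt}],
                },
            )
            resp.raise_for_status()
            return json.loads(resp.json()["choices"][0]["message"]["content"])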

    async def store_analysis(self, issue_key: str, analysis: dict):
        """Store analysis result in database."""
        # TODO: Implement database storage
        logger.info(f"💾 Storing analysis for {issue_key}")

    async def create_pull_request(self, issue_key: str, analysis: dict) -> Optional[str]:
        """Create a pull request with the proposed fix."""
        # TODO: Implement Bitbucket PR creation
        logger.info(f"📝 Creating PR for {issue_key}")
        return None
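
    # --- Illustrative sketch, not part of this commit -----------------------
    # Opening the PR itself could look like this, assuming Bitbucket Cloud's
    # 2.0 API, that a fix branch has already been pushed, and hypothetical
    # BITBUCKET_WORKSPACE / BITBUCKET_REPO / BITBUCKET_TOKEN settings.
    # (A Bitbucket Server instance would use its /rest/api/1.0 endpoints instead.)
    async def _create_pull_request_sketch(self, issue_key: str, branch: str) -> Optional[str]:
        import httpx

        workspace = os.environ["BITBUCKET_WORKSPACE"]
        repo = os.environ["BITBUCKET_REPO"]
        async with httpx.AsyncClient(timeout=30) as client:
            resp = await client.post(
                f"https://api.bitbucket.org/2.0/repositories/{workspace}/{repo}/pullrequests",
                headers={"Authorization": f"Bearer {os.environ['BITBUCKET_TOKEN']}"},
                json={
                    "title": f"{issue_key}: automated fix proposal",
                    "source": {"branch": {"name": branch}},
                    "destination": {"branch": {"name": "main"}},
                },
            )
            resp.raise_for_status()
            return resp.json()["links"]["html"]["href"]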

    async def post_jira_comment(self, issue_key: str, analysis: dict):
        """Post analysis as a comment on the JIRA issue."""
        # TODO: Implement JIRA comment
        logger.info(f"💬 Posting comment to {issue_key}")
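
    # --- Illustrative sketch, not part of this commit -----------------------
    # Posting the result back to JIRA could reuse the same assumed JIRA_*
    # environment variables as the fetch sketch above (REST API v2 comments).
    async def _post_jira_comment_sketch(self, issue_key: str, analysis: dict):
        import httpx

        text = (
            f"Automated analysis (confidence {analysis.get('confidence', 0):.0%})\n"
            f"Root cause: {analysis.get('root_cause', '')}\n\n"
            f"Proposed fix:\n{analysis.get('proposed_fix', '')}"
        )
        base_url = os.environ["JIRA_BASE_URL"].rstrip("/")
        auth = (os.environ["JIRA_EMAIL"], os.environ["JIRA_API_TOKEN"])
        async with httpx.AsyncClient(auth=auth, timeout=30) as client:
            resp = await client.post(
                f"{base_url}/rest/api/2/issue/{issue_key}/comment",
                json={"body": text},
            )
            resp.raise_for_status()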

    async def update_status(self, issue_key: str, status: str, error: Optional[str] = None):
        """Update issue status in database and Redis."""
        # TODO: Implement status update
        logger.info(f"📊 Status update: {issue_key} -> {status}")
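
    # --- Illustrative sketch, not part of this commit -----------------------
    # The Redis half of update_status could keep one hash per issue; the key
    # layout "issues:status:<key>" is an assumption.
    async def _update_status_sketch(self, issue_key: str, status: str, error: Optional[str] = None):
        fields = {"status": status, "updated_at": datetime.utcnow().isoformat()}
        if error:
            fields["error"] = error
        if self.redis:
            await self.redis.hset(f"issues:status:{issue_key}", mapping=fields)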

    def stop(self):
        """Signal worker to stop."""
        self.running = False


async def main():
    """Entry point for the worker."""
    worker = IssueAnalyzer(
        redis_url=os.getenv("REDIS_URL", "redis://localhost:6379"),
    )

    try:
        await worker.run()
    except KeyboardInterrupt:
        worker.stop()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,261 @@
"""
Code Indexer Worker - Background indexing of Bitbucket repositories.
"""
import asyncio
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any

import redis.asyncio as redis

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CodeIndexer:
    """
    Background worker that indexes code from Bitbucket repositories.

    Flow:
    1. Poll Redis queue for indexing jobs
    2. Clone/pull repository
    3. Parse COBOL/SQL/JCL files
    4. Generate embeddings
    5. Store in Qdrant vector database
    """

    # File extensions to index
    INDEXABLE_EXTENSIONS = {
        ".cbl": "cobol",
        ".cob": "cobol",
        ".cpy": "copybook",
        ".sql": "sql",
        ".jcl": "jcl",
        ".proc": "jcl_proc",
    }

    def __init__(
        self,
        redis_url: str = "redis://localhost:6379",
        qdrant_url: str = "http://localhost:6333",
        work_dir: str = "/tmp/aci-indexer",
        queue_name: str = "indexer:jobs",
    ):
        self.redis_url = redis_url
        self.qdrant_url = qdrant_url
        self.work_dir = Path(work_dir)
        self.queue_name = queue_name
        self.redis: Optional[redis.Redis] = None
        self.running = False

        # Create work directory
        self.work_dir.mkdir(parents=True, exist_ok=True)

    async def connect(self):
        """Connect to Redis."""
        self.redis = redis.from_url(self.redis_url)
        logger.info(f"🔌 Connected to Redis: {self.redis_url}")

    async def disconnect(self):
        """Disconnect from Redis."""
        if self.redis:
            await self.redis.close()

    async def run(self):
        """Main worker loop."""
        self.running = True
        await self.connect()

        logger.info("🚀 Code Indexer worker started")

        while self.running:
            try:
                # Block waiting for new jobs (timeout 10s)
                result = await self.redis.blpop(self.queue_name, timeout=10)

                if result:
                    _, data = result
                    job = json.loads(data)
                    await self.process_job(job)
            except Exception as e:
                logger.error(f"❌ Error in worker loop: {e}")
                await asyncio.sleep(5)

        await self.disconnect()
        logger.info("👋 Code Indexer worker stopped")

    async def process_job(self, job: dict):
        """
        Process an indexing job.

        Args:
            job: Dict with repository info and options
        """
        repo_name = job.get("name")
        repo_url = job.get("url")

        logger.info(f"📦 Indexing repository: {repo_name}")

        try:
            # 1. Clone or update repository
            repo_path = await self.clone_or_update(repo_name, repo_url)

            # 2. Find indexable files
            files = self.find_indexable_files(repo_path)
            logger.info(f"📁 Found {len(files)} indexable files")

            # 3. Process files in batches
            total_chunks = 0
            batch_size = 10

            for i in range(0, len(files), batch_size):
                batch = files[i:i + batch_size]
                chunks = await self.process_file_batch(batch, repo_name)
                total_chunks += chunks

                # Report progress
                progress = min(100, ((i + batch_size) / len(files)) * 100)
                await self.report_progress(repo_name, progress)

            # 4. Update repository status
            await self.update_repo_status(repo_name, total_chunks)

            logger.info(f"✅ Indexed {repo_name}: {total_chunks} chunks from {len(files)} files")

        except Exception as e:
            logger.error(f"❌ Failed to index {repo_name}: {e}")
            await self.update_repo_status(repo_name, 0, error=str(e))

    async def clone_or_update(self, name: str, url: str) -> Path:
        """Clone repository or pull latest changes."""
        repo_path = self.work_dir / name

        if repo_path.exists():
            logger.info(f"📥 Pulling latest changes: {name}")
            # TODO: Implement git pull
        else:
            logger.info(f"📥 Cloning repository: {name}")
            # TODO: Implement git clone
            repo_path.mkdir(parents=True, exist_ok=True)

        return repo_path
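
    # --- Illustrative sketch, not part of this commit -----------------------
    # The two git TODOs above could shell out via asyncio; the clone URL is
    # used as given, so credentials are assumed to be embedded in it or
    # supplied by a credential helper. The helper name is hypothetical.
    async def _clone_or_update_sketch(self, name: str, url: str) -> Path:
        repo_path = self.work_dir / name
        if (repo_path / ".git").exists():
            cmd = ["git", "-C", str(repo_path), "pull", "--ff-only"]
        else:
            cmd = ["git", "clone", "--depth", "1", url, str(repo_path)]
        proc = await asyncio.create_subprocess_exec(*cmd)
        if await proc.wait() != 0:
            raise RuntimeError(f"git failed for {name}")
        return repo_path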

    def find_indexable_files(self, repo_path: Path) -> List[Path]:
        """Find all files that should be indexed."""
        files = []

        for ext in self.INDEXABLE_EXTENSIONS:
            files.extend(repo_path.rglob(f"*{ext}"))

        return files

    async def process_file_batch(self, files: List[Path], repo_name: str) -> int:
        """Process a batch of files and return chunk count."""
        total_chunks = 0

        for file_path in files:
            try:
                content = file_path.read_text(encoding="latin-1")
                file_type = self.INDEXABLE_EXTENSIONS.get(file_path.suffix.lower())

                # Parse into chunks
                chunks = self.parse_file(content, file_path, file_type)

                # Generate embeddings and store
                for chunk in chunks:
                    await self.index_chunk(chunk, repo_name)
                    total_chunks += 1

            except Exception as e:
                logger.warning(f"⚠️ Failed to process {file_path}: {e}")

        return total_chunks

    def parse_file(self, content: str, path: Path, file_type: str) -> List[Dict[str, Any]]:
        """Parse a file into indexable chunks."""
        chunks = []

        if file_type in ("cobol", "copybook"):
            chunks = self.parse_cobol(content, path)
        elif file_type == "sql":
            chunks = self.parse_sql(content, path)
        elif file_type in ("jcl", "jcl_proc"):
            chunks = self.parse_jcl(content, path)

        return chunks

    def parse_cobol(self, content: str, path: Path) -> List[Dict[str, Any]]:
        """Parse COBOL program into chunks."""
        # TODO: Implement full COBOL parsing
        # For now, create one chunk per file
        return [{
            "file_path": str(path),
            "content": content[:4000],  # Limit size
            "type": "cobol",
            "start_line": 1,
            "end_line": content.count("\n"),
        }]
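
    # --- Illustrative sketch, not part of this commit -----------------------
    # A slightly finer-grained take on the COBOL TODO: split on DIVISION
    # headers so each chunk stays focused. The helper name is hypothetical and
    # the whole-file fallback above still applies when no divisions are found.
    def _parse_cobol_by_division_sketch(self, content: str, path: Path) -> List[Dict[str, Any]]:
        import re

        lines = content.splitlines()
        starts = [
            i for i, line in enumerate(lines)
            if re.search(r"\b(IDENTIFICATION|ENVIRONMENT|DATA|PROCEDURE)\s+DIVISION\b", line, re.IGNORECASE)
        ]
        if not starts:
            return self.parse_cobol(content, path)

        starts.append(len(lines))
        return [
            {
                "file_path": str(path),
                "content": "\n".join(lines[begin:end])[:4000],
                "type": "cobol",
                "start_line": begin + 1,
                "end_line": end,
            }
            for begin, end in zip(starts, starts[1:])
        ]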

    def parse_sql(self, content: str, path: Path) -> List[Dict[str, Any]]:
        """Parse SQL file into chunks."""
        return [{
            "file_path": str(path),
            "content": content[:4000],
            "type": "sql",
            "start_line": 1,
            "end_line": content.count("\n"),
        }]

    def parse_jcl(self, content: str, path: Path) -> List[Dict[str, Any]]:
        """Parse JCL file into chunks."""
        return [{
            "file_path": str(path),
            "content": content[:4000],
            "type": "jcl",
            "start_line": 1,
            "end_line": content.count("\n"),
        }]

    async def index_chunk(self, chunk: Dict[str, Any], repo_name: str):
        """Generate embedding and store chunk in Qdrant."""
        # TODO: Implement embedding generation and Qdrant storage
        pass
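
    # --- Illustrative sketch, not part of this commit -----------------------
    # index_chunk could embed with sentence-transformers and upsert into Qdrant
    # (both in the requirements). The collection name "code_chunks", the model
    # and the uuid ids are assumptions; a real implementation would create the
    # model and client once, not per chunk.
    async def _index_chunk_sketch(self, chunk: Dict[str, Any], repo_name: str):
        import uuid

        from qdrant_client import QdrantClient
        from qdrant_client.models import PointStruct
        from sentence_transformers import SentenceTransformer

        model = SentenceTransformer("all-MiniLM-L6-v2")
        vector = model.encode(chunk["content"]).tolist()

        client = QdrantClient(url=self.qdrant_url)
        client.upsert(
            collection_name="code_chunks",
            points=[PointStruct(
                id=str(uuid.uuid4()),
                vector=vector,
                payload={**chunk, "repository": repo_name},
            )],
        )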

    async def report_progress(self, repo_name: str, progress: float):
        """Report indexing progress."""
        if self.redis:
            await self.redis.set(f"indexer:progress:{repo_name}", str(progress))

    async def update_repo_status(
        self,
        repo_name: str,
        chunk_count: int,
        error: Optional[str] = None,
    ):
        """Update repository status in database."""
        # TODO: Implement database update
        status = "indexed" if not error else "failed"
        logger.info(f"📊 Repo status: {repo_name} -> {status} ({chunk_count} chunks)")

    def stop(self):
        """Signal worker to stop."""
        self.running = False


async def main():
    """Entry point for the worker."""
    worker = CodeIndexer(
        redis_url=os.getenv("REDIS_URL", "redis://localhost:6379"),
        qdrant_url=os.getenv("QDRANT_URL", "http://localhost:6333"),
    )

    try:
        await worker.run()
    except KeyboardInterrupt:
        worker.stop()


if __name__ == "__main__":
    asyncio.run(main())
@@ -0,0 +1,5 @@
redis>=5.0.0
httpx>=0.26.0
sentence-transformers>=2.2.0
qdrant-client>=1.7.0
python-dotenv>=1.0.0
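
For reference, producers hand work to these workers by pushing JSON onto the Redis lists the workers poll. A minimal sketch (the queue names come from the workers above; the issue key, repository name and URL are made-up examples):

import asyncio
import json
import os

import redis.asyncio as redis


async def enqueue_examples():
    r = redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379"))
    # Ask the analyzer to look at a JIRA issue.
    await r.rpush("issues:pending", json.dumps({"key": "PROJ-123"}))
    # Ask the indexer to (re)index a repository.
    await r.rpush("indexer:jobs", json.dumps({
        "name": "example-repo",
        "url": "https://bitbucket.example.com/scm/example-repo.git",
    }))
    await r.close()


if __name__ == "__main__":
    asyncio.run(enqueue_examples())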