Lab 09: QA 에이전트 구현
고급
마감: 2026-05-06
1. qa_runner.py
2. code_reviewer.py
3. qa_agent.py
4. pipeline_e2e.py
목표
- QAAgent 클래스 구현 — 자동 테스트 실행, 커버리지 측정, 코드 리뷰
- 코더 에이전트와의 피드백 루프(feedback loop) 구현
- Planner → Coder → QA 3단계 파이프라인 end-to-end 시연
QA 에이전트의 역할
QA 에이전트는 코더의 출력을 검증하고 피드백을 제공하는 품질 관문(quality gate)이다.
CoderAgent ──코드변경──▶ QAAgent
                          ├── pytest 실행
                          ├── coverage 측정
                          ├── LLM 코드 리뷰 (ReviewerAgent)
                          └── 피드백 판정
                               │
                        통과 ──┴── 실패 → CoderAgent (재시도)

구현 요구사항
1. qa_runner.py — 테스트 실행 엔진
import subprocessimport jsonimport refrom dataclasses import dataclassfrom pathlib import Path
@dataclassclass TestResult: passed: int failed: int errors: int duration_sec: float coverage_pct: float | None failed_tests: list[str] full_output: str
@property def all_passed(self) -> bool: return self.failed == 0 and self.errors == 0
def to_summary(self) -> str: status = "PASS" if self.all_passed else "FAIL" lines = [ f"[{status}] 통과: {self.passed} | 실패: {self.failed} | 오류: {self.errors}", f"실행 시간: {self.duration_sec:.2f}초", ] if self.coverage_pct is not None: lines.append(f"커버리지: {self.coverage_pct:.1f}%") if self.failed_tests: lines.append("실패 테스트:") lines.extend(f" - {t}" for t in self.failed_tests) return "\n".join(lines)
class TestRunner: def __init__(self, test_dir: str = "tests/"): self.test_dir = test_dir
def run(self, with_coverage: bool = True) -> TestResult: cmd = ["python", "-m", "pytest", self.test_dir, "-v", "--tb=short", "--json-report"] if with_coverage: cmd.extend([ f"--cov={self.test_dir.replace('tests/', 'src/')}", "--cov-report=json" ])
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120) return self._parse_output(result)
def _parse_output(self, result: subprocess.CompletedProcess) -> TestResult: output = result.stdout + result.stderr
# pytest JSON 리포트 파싱 (--json-report 사용 시) report_path = Path(".report.json") if report_path.exists(): report = json.loads(report_path.read_text()) summary = report.get("summary", {}) failed_tests = [ t["nodeid"] for t in report.get("tests", []) if t["outcome"] in ("failed", "error") ] else: # 폴백: 텍스트 파싱 passed = len(re.findall(r"PASSED", output)) failed = len(re.findall(r"FAILED", output)) summary = {"passed": passed, "failed": failed, "error": 0} failed_tests = re.findall(r"FAILED (.+?) -", output)
# 커버리지 파싱 coverage_pct = None cov_path = Path("coverage.json") if cov_path.exists(): cov = json.loads(cov_path.read_text()) coverage_pct = cov.get("totals", {}).get("percent_covered")
duration_match = re.search(r"(\d+\.\d+)s", output) duration = float(duration_match.group(1)) if duration_match else 0.0
return TestResult( passed=summary.get("passed", 0), failed=summary.get("failed", 0), errors=summary.get("error", 0), duration_sec=duration, coverage_pct=coverage_pct, failed_tests=failed_tests, full_output=output[:2000] )2. code_reviewer.py — LLM 기반 코드 리뷰어
import subprocessimport anthropicfrom dataclasses import dataclass
@dataclass
class ReviewResult:
    """Outcome of an LLM code review over a git diff."""

    severity: str  # "pass" | "warn" | "block"
    issues: list[str]
    suggestions: list[str]
    score: int  # 0-100

    def should_block(self) -> bool:
        """True when the pipeline must stop: blocking severity or very low score."""
        return self.severity == "block" or self.score < 40
# System prompt for the reviewer model: forces a strict JSON verdict.
REVIEW_SYSTEM = """You are a strict code reviewer. Review the provided git diff and identify:
1. Security vulnerabilities (BLOCK if found)
2. Logic errors that tests might miss (BLOCK if severe)
3. Code style issues (WARN)
4. Performance concerns (WARN)

Respond in JSON:
{
  "severity": "pass|warn|block",
  "issues": ["issue1", "issue2"],
  "suggestions": ["suggestion1"],
  "score": 0-100,
  "reasoning": "brief explanation"
}"""
class CodeReviewer:
    """LLM-backed reviewer that scores a git diff via the Anthropic API."""

    def __init__(self):
        self.client = anthropic.Anthropic()

    def review_diff(self, diff: str) -> "ReviewResult":
        """Ask the model to review *diff* and parse its JSON verdict.

        Returns a neutral "warn" result when the model reply contains no
        JSON or invalid JSON, so a malformed reply never crashes the pipeline
        (bug fix: json.loads previously raised JSONDecodeError uncaught).
        """
        # Empty diff: nothing changed, trivially passes review.
        if not diff.strip():
            return ReviewResult("pass", [], [], 100)

        response = self.client.messages.create(
            model="claude-sonnet-4-6",
            max_tokens=1024,
            system=REVIEW_SYSTEM,
            messages=[{
                "role": "user",
                # Truncate very large diffs to keep the prompt bounded.
                "content": f"Review this diff:\n```diff\n{diff[:3000]}\n```"
            }]
        )

        import json
        import re

        text = response.content[0].text
        # Grab the first {...} span; the model may wrap the JSON in prose.
        match = re.search(r"\{[\s\S]+\}", text)
        if not match:
            return ReviewResult("warn", ["리뷰 파싱 실패"], [], 50)

        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            # Brace-like text that is not valid JSON: degrade to a warning.
            return ReviewResult("warn", ["리뷰 파싱 실패"], [], 50)

        return ReviewResult(
            severity=data.get("severity", "warn"),
            issues=data.get("issues", []),
            suggestions=data.get("suggestions", []),
            score=data.get("score", 50)
        )

    def get_diff(self) -> str:
        """Return the uncommitted working-tree diff against HEAD."""
        result = subprocess.run(
            ["git", "diff", "HEAD"],
            capture_output=True, text=True
        )
        return result.stdout

# 3. qa_agent.py — QA agent main
from qa_runner import TestRunner, TestResultfrom code_reviewer import CodeReviewer, ReviewResultfrom dataclasses import dataclass
@dataclassclass QAReport: iteration: int test_result: TestResult review_result: ReviewResult verdict: str # "approve" | "request_changes" | "reject" feedback: str # 코더 에이전트에게 전달할 피드백
class QAAgent:
    """Quality gate: runs the test suite, reviews the diff, issues a verdict."""

    # NOTE(review): currently unused — kept for interface compatibility;
    # wire up or remove.
    MAX_REVIEW_RETRIES = 3

    def __init__(self, test_dir: str = "tests/"):
        self.runner = TestRunner(test_dir)
        self.reviewer = CodeReviewer()
        self.history: "list[QAReport]" = []

    def evaluate(self, iteration: int = 1) -> "QAReport":
        """Run tests + review once, record and return the resulting QAReport."""
        print(f"[QA] 이터레이션 {iteration} — 테스트 실행 중...")
        test_result = self.runner.run(with_coverage=True)
        print(test_result.to_summary())

        print("[QA] 코드 리뷰 중...")
        diff = self.reviewer.get_diff()
        review_result = self.reviewer.review_diff(diff)

        verdict, feedback = self._decide(test_result, review_result)
        report = QAReport(
            iteration=iteration,
            test_result=test_result,
            review_result=review_result,
            verdict=verdict,
            feedback=feedback
        )
        self.history.append(report)
        return report

    def _decide(
        self, test: "TestResult", review: "ReviewResult"
    ) -> tuple[str, str]:
        """Map test + review results to a (verdict, feedback) pair.

        Order matters: failing tests, then blocking reviews, then the
        coverage gate, then review warnings. The coverage gate runs BEFORE
        the warn branch so that a mere warning can never bypass the 70%
        coverage minimum (bug fix: "warn" previously approved unchecked).
        """
        if not test.all_passed:
            feedback = (
                f"테스트 {test.failed}개 실패:\n"
                + "\n".join(f"- {t}" for t in test.failed_tests[:5])
                + f"\n\n테스트 출력:\n{test.full_output[:500]}"
            )
            return "request_changes", feedback

        if review.should_block():
            feedback = (
                f"코드 리뷰 차단 (점수: {review.score}/100):\n"
                + "\n".join(f"- {i}" for i in review.issues)
            )
            return "reject", feedback

        # NOTE(review): coverage_pct None (not measured) is treated as 0 and
        # rejected — confirm this is the intended policy.
        coverage = test.coverage_pct or 0
        if coverage < 70:
            return "request_changes", f"커버리지 {coverage:.1f}% — 최소 70% 필요"

        if review.severity == "warn":
            feedback = (
                "테스트 통과, 경고 사항:\n"
                + "\n".join(f"- {i}" for i in review.issues)
            )
            return "approve", feedback

        return "approve", f"모든 테스트 통과. 커버리지: {coverage:.1f}%"

# 4. end-to-end pipeline
from planner_agent import PlannerAgentfrom coder_agent import CoderAgentfrom qa_agent import QAAgent
def run_pipeline(objective: str, codebase_root: str = "."):
    """Planner → Coder → QA pipeline with a bounded feedback loop.

    The coder gets up to MAX_CODER_RETRIES attempts; after each attempt the
    QA agent evaluates the change, and its feedback is forwarded into the
    next coding attempt. Returns the approving QAReport, the last report on
    exhausted retries, or None if QA never produced a report.
    """
    MAX_CODER_RETRIES = 3

    # Stage 1: planning
    print("=" * 50)
    print("STAGE 1: PLANNING")
    planner = PlannerAgent(codebase_root)
    plan = planner.plan(objective)

    # Stages 2-3: coding with a QA feedback loop
    coder = CoderAgent()
    qa = QAAgent()

    for attempt in range(1, MAX_CODER_RETRIES + 1):
        print(f"\n{'=' * 50}")
        print(f"STAGE 2: CODING (시도 {attempt}/{MAX_CODER_RETRIES})")

        coder_input = {"plan": plan}
        # From the second attempt on, pass the previous QA feedback along.
        if attempt > 1 and qa.history:
            last_feedback = qa.history[-1].feedback
            coder_input["feedback"] = last_feedback
            print(f"[Pipeline] QA 피드백 코더에 전달:\n{last_feedback[:200]}")

        coder.run(coder_input)

        print(f"\n{'=' * 50}")
        print(f"STAGE 3: QA (시도 {attempt})")
        report = qa.evaluate(iteration=attempt)

        print(f"[Pipeline] QA 판정: {report.verdict}")
        if report.verdict == "approve":
            print("[Pipeline] 파이프라인 완료 — 승인")
            return report

    print(f"[Pipeline] 최대 시도 횟수 초과 — 파이프라인 실패")
    return qa.history[-1] if qa.history else None
if __name__ == "__main__": result = run_pipeline( objective="divide() 함수에 ZeroDivisionError 처리를 추가하라.", codebase_root="." ) if result: print(f"\n최종 결과: {result.verdict}")qa_runner.py구현:pip install pytest pytest-json-report pytest-covcode_reviewer.py구현 및 단독 테스트:python -c "from code_reviewer import CodeReviewer; r = CodeReviewer(); print(r.review_diff('+ x = 1/0'))"qa_agent.py구현pipeline_e2e.py로 end-to-end 실행- QA가 코더에게 피드백을 보내는 사이클이 최소 1회 이상 발생하는 시나리오 확인
제출물
assignments/lab-09/[학번]/에 PR:
- qa_runner.py — pytest 실행 및 커버리지 수집
- code_reviewer.py — LLM 기반 diff 리뷰
- qa_agent.py — 판정 로직 포함 완전한 QA 에이전트
- pipeline_e2e.py — Planner→Coder→QA 3단계 파이프라인
- qa_reports/ — 실제 실행된 QA 보고서 JSON (최소 2회)
- README.md — end-to-end 실행 결과, 피드백 루프 동작 설명, 커버리지 수치