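"""
FastAPI application that exposes agbenchmark over the Agent Protocol.

On import, this module loads every challenge spec (`data.json`) found under the
bundled `challenges/` directory into `CHALLENGES`, keyed by `eval_id`.
`setup_fastapi_app()` then builds an app (routes mounted under `/ap/v1`) that can
start benchmark runs, create agent tasks for specific challenges, proxy step
execution to the agent under test, and evaluate finished tasks into `BenchmarkRun`
reports.
"""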
import datetime
import glob
import json
import logging
import sys
import time
import uuid
from collections import deque
from multiprocessing import Process
from pathlib import Path
from typing import Optional

import httpx
import psutil
from agent_protocol_client import AgentApi, ApiClient, ApiException, Configuration
from agent_protocol_client.models import Task, TaskRequestBody
from fastapi import APIRouter, FastAPI, HTTPException, Request, Response
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel, Extra, ValidationError

from agbenchmark.challenges import ChallengeInfo
from agbenchmark.config import AgentBenchmarkConfig
from agbenchmark.reports.processing.report_types_v2 import (
    BenchmarkRun,
    Metrics,
    RepositoryInfo,
    RunDetails,
    TaskInfo,
)
from agbenchmark.schema import TaskEvalRequestBody
from agbenchmark.utils.utils import write_pretty_json

sys.path.append(str(Path(__file__).parent.parent))

logger = logging.getLogger(__name__)

CHALLENGES: dict[str, ChallengeInfo] = {}
challenges_path = Path(__file__).parent / "challenges"
challenge_spec_files = deque(
    glob.glob(
        f"{challenges_path}/**/data.json",
        recursive=True,
    )
)

logger.debug("Loading challenges...")
while challenge_spec_files:
    challenge_spec_file = Path(challenge_spec_files.popleft())
    challenge_relpath = challenge_spec_file.relative_to(challenges_path.parent)
    if challenge_relpath.is_relative_to("challenges/deprecated"):
        continue

    logger.debug(f"Loading {challenge_relpath}...")
    try:
        challenge_info = ChallengeInfo.parse_file(challenge_spec_file)
    except ValidationError as e:
        if logging.getLogger().level == logging.DEBUG:
            logger.warning(f"Spec file {challenge_relpath} failed to load:\n{e}")
            logger.debug(f"Invalid challenge spec: {challenge_spec_file.read_text()}")
        continue
    challenge_info.spec_file = challenge_spec_file

    if not challenge_info.eval_id:
        challenge_info.eval_id = str(uuid.uuid4())
        # this will sort all the keys of the JSON systematically
        # so that the order is always the same
        write_pretty_json(challenge_info.dict(), challenge_spec_file)

    CHALLENGES[challenge_info.eval_id] = challenge_info


class BenchmarkTaskInfo(BaseModel):
    task_id: str
    start_time: datetime.datetime
    challenge_info: ChallengeInfo


task_informations: dict[str, BenchmarkTaskInfo] = {}


def find_agbenchmark_without_uvicorn():
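    """Return the PIDs of running processes whose info mentions "agbenchmark" but not "uvicorn"."""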
    pids = []
    for process in psutil.process_iter(
        attrs=[
            "pid",
            "cmdline",
            "name",
            "username",
            "status",
            "cpu_percent",
            "memory_info",
            "create_time",
            "cwd",
            "connections",
        ]
    ):
        try:
            # Concatenate the string values of the process info dict so it can be searched
            full_info = " ".join(str(v) for v in process.as_dict().values())

            if "agbenchmark" in full_info and "uvicorn" not in full_info:
                pids.append(process.pid)
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            pass
    return pids


class CreateReportRequest(BaseModel):
    test: Optional[str] = None
    test_run_id: Optional[str] = None
    # category: Optional[str] = []
    mock: Optional[bool] = False

    class Config:
        extra = Extra.forbid  # this will forbid any extra fields


updates_list = []

origins = [
    "http://localhost:8000",
    "http://localhost:8080",
    "http://127.0.0.1:5000",
    "http://localhost:5000",
]


def stream_output(pipe):
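    """Echo each line read from `pipe` to stdout as it arrives."""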
    for line in pipe:
        print(line, end="")


def setup_fastapi_app(agbenchmark_config: AgentBenchmarkConfig) -> FastAPI:
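    """Build the FastAPI app serving the benchmark `/reports` endpoint and the
    Agent Protocol task/step/evaluation endpoints for the given benchmark config.
    """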
    from agbenchmark.agent_api_interface import upload_artifacts
    from agbenchmark.challenges import get_challenge_from_source_uri
    from agbenchmark.main import run_benchmark

    configuration = Configuration(
        host=agbenchmark_config.host or "http://localhost:8000"
    )
    app = FastAPI()
    app.add_middleware(
        CORSMiddleware,
        allow_origins=origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    router = APIRouter()

    @router.post("/reports")
    def run_single_test(body: CreateReportRequest) -> dict:
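        """Run a single benchmark test in a subprocess and return the contents of
        the resulting report.json.
        """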
        pids = find_agbenchmark_without_uvicorn()
        logger.info(f"pids already running with agbenchmark: {pids}")

        logger.debug(f"Request to /reports: {body.dict()}")

        # Start the benchmark in a separate process
        benchmark_process = Process(
            target=lambda: run_benchmark(
                config=agbenchmark_config,
                tests=(body.test,),
                mock=body.mock or False,
            )
        )
        benchmark_process.start()

        # Wait for the benchmark to finish, with a timeout of 200 seconds
        timeout = 200
        start_time = time.time()
        while benchmark_process.is_alive():
            if time.time() - start_time > timeout:
                logger.warning(f"Benchmark run timed out after {timeout} seconds")
                benchmark_process.terminate()
                break
            time.sleep(1)
        else:
            logger.debug(f"Benchmark finished running in {time.time() - start_time} s")

        # List all folders in the reports directory
        path_reports = agbenchmark_config.reports_folder
        folders = [folder for folder in path_reports.iterdir() if folder.is_dir()]

        # Sort the folders based on their names
        sorted_folders = sorted(folders, key=lambda x: x.name)

        # Get the last folder
        latest_folder = sorted_folders[-1] if sorted_folders else None

        # Read report.json from this folder; fall back to an empty result if missing
        data = {}
        if latest_folder:
            report_path = latest_folder / "report.json"
            logger.debug(f"Getting latest report from {report_path}")
            if report_path.exists():
                with report_path.open() as file:
                    data = json.load(file)
                logger.debug(f"Report data: {data}")
            else:
                logger.error(
                    "Could not get result after running benchmark: "
                    f"'report.json' does not exist in '{latest_folder}'"
                )
        else:
            logger.error(
                "Could not get result after running benchmark: no reports found"
            )

        return data

    @router.post("/agent/tasks", tags=["agent"])
    async def create_agent_task(task_eval_request: TaskEvalRequestBody) -> Task:
        """
        Creates a new task using the provided TaskEvalRequestBody and returns a Task.

        Args:
            task_eval_request: A `TaskRequestBody` extended with the `eval_id` of
                the challenge to run (see `TaskEvalRequestBody` in schema.py).

        Returns:
            Task: A new task with task_id, input, additional_input,
            and empty lists for artifacts and steps.

        Example:
            Request (TaskEvalRequestBody defined in schema.py):
                {
                    ...,
                    "eval_id": "50da533e-3904-4401-8a07-c49adf88b5eb"
                }

            Response (Task defined in `agent_protocol_client.models`):
                {
                    "task_id": "50da533e-3904-4401-8a07-c49adf88b5eb",
                    "input": "Write the word 'Washington' to a .txt file",
                    "artifacts": []
                }
        """
        try:
            challenge_info = CHALLENGES[task_eval_request.eval_id]
            async with ApiClient(configuration) as api_client:
                api_instance = AgentApi(api_client)
                task_input = challenge_info.task

                task_request_body = TaskRequestBody(input=task_input)
                task_response = await api_instance.create_agent_task(
                    task_request_body=task_request_body
                )
                task_info = BenchmarkTaskInfo(
                    task_id=task_response.task_id,
                    start_time=datetime.datetime.now(datetime.timezone.utc),
                    challenge_info=challenge_info,
                )
                task_informations[task_info.task_id] = task_info

                if input_artifacts_dir := challenge_info.task_artifacts_dir:
                    await upload_artifacts(
                        api_instance,
                        input_artifacts_dir,
                        task_response.task_id,
                        "artifacts_in",
                    )
                return task_response
        except ApiException as e:
            logger.error(f"Error whilst trying to create a task:\n{e}")
            logger.error(
                "The above error was caused while processing request: "
                f"{task_eval_request}"
            )
            raise HTTPException(500)

    @router.post("/agent/tasks/{task_id}/steps")
    async def proxy(request: Request, task_id: str):
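        """Forward a step execution request to the agent's own Agent Protocol server
        and relay its response.
        """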
        timeout = httpx.Timeout(300.0, read=300.0)  # 5 minutes
        async with httpx.AsyncClient(timeout=timeout) as client:
            # Construct the new URL
            new_url = f"{configuration.host}/ap/v1/agent/tasks/{task_id}/steps"

            # Forward the request
            response = await client.post(
                new_url,
                data=await request.body(),
                headers=dict(request.headers),
            )

            # Return the response from the forwarded request
            return Response(content=response.content, status_code=response.status_code)

    @router.post("/agent/tasks/{task_id}/evaluations")
    async def create_evaluation(task_id: str) -> BenchmarkRun:
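        """Evaluate the current state of the task against its challenge and return
        the result as a `BenchmarkRun`.
        """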
        task_info = task_informations[task_id]
        challenge = get_challenge_from_source_uri(task_info.challenge_info.source_uri)
        try:
            async with ApiClient(configuration) as api_client:
                api_instance = AgentApi(api_client)
                eval_results = await challenge.evaluate_task_state(
                    api_instance, task_id
                )

            eval_info = BenchmarkRun(
                repository_info=RepositoryInfo(),
                run_details=RunDetails(
                    command=f"agbenchmark --test={challenge.info.name}",
                    benchmark_start_time=(
                        task_info.start_time.strftime("%Y-%m-%dT%H:%M:%S+00:00")
                    ),
                    test_name=challenge.info.name,
                ),
                task_info=TaskInfo(
                    data_path=challenge.info.source_uri,
                    is_regression=None,
                    category=[c.value for c in challenge.info.category],
                    task=challenge.info.task,
                    answer=challenge.info.reference_answer or "",
                    description=challenge.info.description or "",
                ),
                metrics=Metrics(
                    success=all(e.passed for e in eval_results),
                    success_percentage=(
                        100 * sum(e.score for e in eval_results) / len(eval_results)
                        if eval_results  # avoid division by 0
                        else 0
                    ),
                    attempted=True,
                ),
                config={},
            )

            logger.debug(f"Returning evaluation data:\n{eval_info.json(indent=4)}")
            return eval_info
        except ApiException as e:
            logger.error(f"Error {e} whilst trying to evaluate task: {task_id}")
            raise HTTPException(500)

    app.include_router(router, prefix="/ap/v1")

    return app
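

# Illustrative usage sketch for serving the app (assumptions: `config` is an
# AgentBenchmarkConfig built by the caller; the host and port are arbitrary choices):
#
#     import uvicorn
#
#     app = setup_fastapi_app(config)
#     uvicorn.run(app, host="localhost", port=8080)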