Added script to load data into a df (#348)
Co-authored-by: SwiftyOS <craigswift13@gmail.com>
parent 819764372e
commit abed1ae879
@@ -0,0 +1,181 @@
import glob
import json
import os

import pandas as pd
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport


def get_reports():
    # Initialize an empty list to store the report data
    report_data = []

    # Specify the path to the reports directory
    reports_dir = 'reports'

    # Iterate over all agent directories in the reports directory
    for agent_name in os.listdir(reports_dir):
        agent_dir = os.path.join(reports_dir, agent_name)

        # Check if the item is a directory (an agent directory)
        if os.path.isdir(agent_dir):
            # Use glob to find all run directories in the agent_dir
            run_dirs = glob.glob(os.path.join(agent_dir, '*'))

            # Append report.json to each run directory path
            report_files = [os.path.join(run_dir, 'report.json') for run_dir in run_dirs]
            for report_file in report_files:
                # Check if the report.json file exists
                if os.path.isfile(report_file):
                    # Open the report.json file and load its JSON contents
                    with open(report_file, 'r') as f:
                        report = json.load(f)

                    # Iterate over all tests in the report
                    for test_name, test_data in report['tests'].items():
                        try:
                            # Append the relevant data to the report_data list
                            report_data.append({
                                'agent': agent_name.lower(),
                                'benchmark_start_time': report['benchmark_start_time'],
                                'challenge': test_name,
                                'categories': ', '.join(test_data['category']),
                                'task': test_data['task'],
                                'success': test_data['metrics']['success'],
                                'difficulty': test_data['metrics']['difficulty'],
                                'success_%': test_data['metrics']['success_%'],
                                'run_time': test_data['metrics']['run_time']
                            })
                        except KeyError:
                            # Skip tests that lack any of the expected fields
                            pass

    return pd.DataFrame(report_data)
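
# For reference, get_reports() assumes each report.json looks roughly like the
# sketch below. The field names mirror the keys accessed above; the values are
# invented for illustration only:
#
# {
#     "benchmark_start_time": "2023-08-01-08:15",
#     "tests": {
#         "TestWriteFile": {
#             "category": ["interface"],
#             "task": "Write 'Hello World' to a file",
#             "metrics": {
#                 "success": true,
#                 "difficulty": "basic",
#                 "success_%": 100.0,
#                 "run_time": "12.5 seconds"
#             }
#         }
#     }
# }

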
def get_helicone_data():
    # The personal access key is read from the HELICONE_API_KEY environment variable
    helicone_api_key = os.getenv('HELICONE_API_KEY')

    url = "https://www.helicone.ai/api/graphql"
    transport = AIOHTTPTransport(url=url, headers={
        "authorization": f"Bearer {helicone_api_key}"
    })

    client = Client(transport=transport, fetch_schema_from_transport=True)

    SIZE = 250
    i = 0
    data = []

    # The query never changes, so build it once outside the pagination loop
    query = gql(
        """
        query ExampleQuery($limit: Int, $offset: Int) {
          heliconeRequest(
            limit: $limit
            offset: $offset
          ) {
            prompt
            properties {
              name
              value
            }
            requestBody
            response
            createdAt
          }
        }
        """
    )

    print("Fetching data from Helicone")
    while True:
        print(f"Fetching {i * SIZE} to {(i + 1) * SIZE} records")
        try:
            result = client.execute(query, variable_values={
                "limit": SIZE,
                "offset": i * SIZE
            })
        except Exception as e:
            print(f"Error occurred: {e}")
            result = None

        i += 1

        if result:
            for item in result["heliconeRequest"]:
                # Flatten the list of name/value pairs into a single dict
                properties = {prop['name']: prop['value'] for prop in item['properties']}
                data.append({
                    'createdAt': item['createdAt'],
                    'agent': properties.get('agent'),
                    'job_id': properties.get('job_id'),
                    'challenge': properties.get('challenge'),
                    'benchmark_start_time': properties.get('benchmark_start_time'),
                    'prompt': item['prompt'],
                    'model': item['requestBody'].get('model'),
                    'request': item['requestBody'].get('messages'),
                })

        # Stop after an error or once a page comes back empty
        if not result or len(result["heliconeRequest"]) == 0:
            print("No more results")
            break

    df = pd.DataFrame(data)

    # Drop rows where agent is None
    df = df.dropna(subset=['agent'])

    # Convert the remaining agent names to lowercase
    df['agent'] = df['agent'].str.lower()

    return df
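
# Note: get_helicone_data() reads HELICONE_API_KEY from the environment, so the
# key must be exported before running the script, e.g. (illustrative value):
#   export HELICONE_API_KEY=sk-your-key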

if os.path.exists('reports_raw.pkl') and os.path.exists('helicone_raw.pkl'):
    # Reuse the cached raw data; delete these .pkl files to force a refetch
    reports_df = pd.read_pickle('reports_raw.pkl')
    helicone_df = pd.read_pickle('helicone_raw.pkl')
else:
    reports_df = get_reports()
    reports_df.to_pickle('reports_raw.pkl')
    helicone_df = get_helicone_data()
    helicone_df.to_pickle('helicone_raw.pkl')


def try_formats(date_str):
    # Timestamp formats tried in order; returns None if neither matches
    formats = ['%Y-%m-%d-%H:%M', '%Y-%m-%dT%H:%M:%S%z']
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            pass
    return None
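
# Illustrative behaviour of try_formats (example values, not real data):
#   try_formats('2023-08-01-14:30')          -> Timestamp('2023-08-01 14:30:00')
#   try_formats('2023-08-01T14:30:00+00:00') -> tz-aware Timestamp
#   try_formats('not-a-date')                -> None (such rows are dropped below)
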
# Normalise benchmark_start_time in both frames to UTC datetimes; rows whose
# timestamps cannot be parsed become NaT and are dropped
helicone_df['benchmark_start_time'] = pd.to_datetime(helicone_df['benchmark_start_time'].apply(try_formats), utc=True)
helicone_df = helicone_df.dropna(subset=['benchmark_start_time'])
helicone_df['createdAt'] = pd.to_datetime(helicone_df['createdAt'], unit='ms', origin='unix')
reports_df['benchmark_start_time'] = pd.to_datetime(reports_df['benchmark_start_time'].apply(try_formats), utc=True)
reports_df = reports_df.dropna(subset=['benchmark_start_time'])

assert pd.api.types.is_datetime64_any_dtype(helicone_df['benchmark_start_time']), "benchmark_start_time in helicone_df is not datetime"
assert pd.api.types.is_datetime64_any_dtype(reports_df['benchmark_start_time']), "benchmark_start_time in reports_df is not datetime"

# Preserve the report's start time under its own name before the merge
reports_df['report_time'] = reports_df['benchmark_start_time']

df = pd.merge_asof(helicone_df.sort_values('benchmark_start_time'),
                   reports_df.sort_values('benchmark_start_time'),
                   left_on='benchmark_start_time',
                   right_on='benchmark_start_time',
                   by=['agent', 'challenge'],
                   direction='backward')
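
# direction='backward' pairs each Helicone request with the most recent report
# whose benchmark_start_time is at or before the request's, matched within each
# (agent, challenge) group. Illustrative example (times invented): a request at
# 08:15 joins a report that started at 08:00 rather than one starting at 09:00.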

df.to_pickle('df.pkl')
df.info()
print("Data saved to df.pkl")
print("To load the data use: df = pd.read_pickle('df.pkl')")