# Mycroft Server - Backend
# Copyright (C) 2019 Mycroft AI Inc
# SPDX-License-Identifier: AGPL-3.0-or-later
#
# This file is part of the Mycroft Server.
#
# The Mycroft Server is free software: you can redistribute it and/or
# modify it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see https://www.gnu.org/licenses/.
"""One-off migration script: load CSV exports and insert them into Postgres.

The script reads a set of CSV files from the current working directory into
module-level dictionaries (``load_csv``), then offers one ``fill_*`` function
per destination table plus two ``analyze_*`` helpers that compare the CSV
geography data with the ``geography`` schema already present in the database.

Importing/running this module connects to the database and executes the
statements at the bottom of the file.
"""
import csv
import datetime
import json
import time
import uuid
from collections import defaultdict

from geopy.distance import distance
from psycopg2 import connect
from psycopg2.extras import execute_batch

# In-memory copies of the CSV exports, keyed by the first CSV column.
users = {}
user_settings = {}
subscription = {}
devices = {}
user_devices = {}  # user uuid -> [(device uuid, device name), ...]
skills = {}
skill_sections = {}
skill_fields = {}
skill_field_values = {}
device_to_skill = {}  # device uuid -> {skill uuid, ...}
skill_to_section = {}  # skill uuid -> {section uuid, ...}
section_to_field = {}  # section uuid -> {field uuid, ...}
device_to_field = {}  # device uuid -> {field uuid, ...}
locations = {}
timezones = {}
cities = {}
regions = {}
countries = {}
coordinates = {}
device_location = {}

# Ids generated for the wake words inserted by fill_default_wake_word().
hey_mycroft = str(uuid.uuid4())
christopher = str(uuid.uuid4())
ezra = str(uuid.uuid4())
jarvis = str(uuid.uuid4())

default_wake_words = {
    "hey mycroft": hey_mycroft,
    "christopher": christopher,
    "hey ezra": ezra,
    "hey jarvis": jarvis,
}

# Same four ids as a set, for filtering rows that reference a default wake word.
_default_wake_word_ids = frozenset(default_wake_words.values())


def _csv_rows(filename):
    """Yield the data rows of ``filename``, skipping the first (header) row.

    The file is opened with ``newline=""`` as the ``csv`` module requires so
    that quoted fields containing line breaks are parsed correctly.
    """
    with open(filename, newline="") as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)
        yield from reader


def load_csv():
    """Populate the module-level dictionaries from the CSV files.

    Every file is expected in the current working directory. Column meaning
    is positional; the first column is used as the dictionary key.
    """
    for row in _csv_rows("users.csv"):
        # email, password
        users[row[0]] = {
            "email": row[1],
            "password": row[2],
            "terms": row[3],
            "privacy": row[4],
        }

    for row in _csv_rows("user_settings.csv"):
        user_settings[row[0]] = {
            "date_format": row[1],
            "time_format": row[2],
            "measurement_system": row[3],
            "tts_type": row[4],
            "tts_voice": row[5],
            "wake_word": row[6],
            "sample_rate": row[7],
            "channels": row[8],
            "pronunciation": row[9],
            "threshold": row[10],
            "threshold_multiplier": row[11],
            "dynamic_energy_ratio": row[12],
        }

    for row in _csv_rows("subscription.csv"):
        subscription[row[0]] = {
            "stripe_customer_id": row[1],
            "last_payment_ts": row[2],
            "type": row[3],
        }

    for row in _csv_rows("devices.csv"):
        device_uuid, user_uuid, name = row[0], row[1], row[2]
        devices[device_uuid] = {
            "user_uuid": user_uuid,
            "name": name,
            # Stored as plain strings. The original wrapped these two values
            # in one-element tuples, which only worked because psycopg2
            # renders a 1-tuple as a parenthesised literal.
            "description": row[3],
            "platform": row[4],
            "enclosure_version": row[5],
            "core_version": row[6],
        }
        user_devices.setdefault(user_uuid, []).append((device_uuid, name))

    for row in _csv_rows("skill.csv"):
        skill_uuid, device_uuid = row[0], row[1]
        skills[skill_uuid] = {
            "device_uuid": device_uuid,
            "name": row[2],
            "description": row[3],
            "identifier": row[4],
        }
        device_to_skill.setdefault(device_uuid, set()).add(skill_uuid)

    for row in _csv_rows("skill_section.csv"):
        section_uuid, skill_uuid = row[0], row[1]
        skill_sections[section_uuid] = {
            "skill_uuid": skill_uuid,
            "section": row[2],
            "display_order": row[3],
        }
        skill_to_section.setdefault(skill_uuid, set()).add(section_uuid)

    for row in _csv_rows("skill_fields.csv"):
        field_uuid, section_uuid = row[0], row[1]
        # The section uuid (row[1]) and the order (row[9]) are deliberately
        # not stored on the field; this dict is serialised into the skill's
        # settings display JSON as-is.
        skill_fields[field_uuid] = {
            "name": row[2],
            "type": row[3],
            "label": row[4],
            "hint": row[5],
            "placeholder": row[6],
            "hide": row[7],
            "options": row[8],
        }
        section_to_field.setdefault(section_uuid, set()).add(field_uuid)

    for row in _csv_rows("skill_fields_values.csv"):
        field_uuid, device_uuid = row[0], row[2]
        skill_field_values[field_uuid] = {
            "skill_uuid": row[1],
            "device_uuid": device_uuid,
            "field_value": row[3],
        }
        device_to_field.setdefault(device_uuid, set()).add(field_uuid)

    for row in _csv_rows("location.csv"):
        locations[row[0]] = {
            "timezone": row[1],
            "city": row[2],
            "coordinate": row[3],
        }

    for row in _csv_rows("timezone.csv"):
        timezones[row[0]] = {"code": row[1], "name": row[2]}

    for row in _csv_rows("city.csv"):
        cities[row[0]] = {"region": row[1], "name": row[2]}

    for row in _csv_rows("region.csv"):
        regions[row[0]] = {"country": row[1], "name": row[2], "code": row[3]}

    for row in _csv_rows("country.csv"):
        countries[row[0]] = {"name": row[1], "code": row[2]}

    for row in _csv_rows("coordinate.csv"):
        coordinates[row[0]] = {"latitude": row[1], "longitude": row[2]}

    for row in _csv_rows("device_location.csv"):
        device_uuid = row[0]
        # Locations for devices that are not in devices.csv are ignored.
        if device_uuid in devices:
            devices[device_uuid]["location"] = row[1]


def format_date(value):
    """Format a millisecond epoch timestamp as ``YYYY-MM-DD`` (local time)."""
    value = datetime.datetime.fromtimestamp(int(value) // 1000)
    return f"{value:%Y-%m-%d}"


def format_timestamp(value):
    """Format a millisecond epoch timestamp as ``YYYY-MM-DD HH:MM:SS``.

    The conversion uses ``datetime.fromtimestamp`` without a timezone, i.e.
    the local timezone of the machine running the script.
    """
    value = datetime.datetime.fromtimestamp(int(value) // 1000)
    return f"{value:%Y-%m-%d %H:%M:%S}"


db = connect(dbname="mycroft", user="postgres", host="127.0.0.1")
db.autocommit = True

# Cache: rate period -> row returned by the membership lookup.
subscription_uuids = {}


def get_subscription_uuid(subs):
    """Return the ``account.membership`` id row for rate period ``subs``.

    The result is the row exactly as ``fetchone()`` returns it (a one-element
    tuple, or ``None`` when no membership matches) and is cached per ``subs``.
    """
    if subs in subscription_uuids:
        return subscription_uuids[subs]
    with db.cursor() as cursor:
        # Parameterised instead of interpolated so quoting is left to psycopg2.
        cursor.execute(
            "select id from account.membership s where s.rate_period = %s",
            (subs,),
        )
        result = cursor.fetchone()
    subscription_uuids[subs] = result
    return result


# Cache: TTS setting name -> row returned by the text_to_speech lookup.
tts_uuids = {}


def get_tts_uuid(tts):
    """Return the ``device.text_to_speech`` id row for setting name ``tts``.

    The result is the row exactly as ``fetchone()`` returns it (a one-element
    tuple, or ``None`` when no setting matches) and is cached per ``tts``.
    """
    if tts in tts_uuids:
        return tts_uuids[tts]
    with db.cursor() as cursor:
        # Parameterised instead of interpolated so quoting is left to psycopg2.
        cursor.execute(
            "select id from device.text_to_speech s where s.setting_name = %s",
            (tts,),
        )
        result = cursor.fetchone()
    tts_uuids[tts] = result
    return result


def _tts_setting_name(user_setting):
    """Map a user's ``tts_type``/``tts_voice`` to a text_to_speech setting name.

    Unknown types and unknown Mimic voices fall back to ``"ap"``.
    """
    tts_type = user_setting["tts_type"]
    if tts_type == "MimicSetting":
        return "amy" if user_setting["tts_voice"] == "trinity" else "ap"
    if tts_type == "Mimic2Setting":
        return "kusal"
    if tts_type == "GoogleTTSSetting":
        return "google"
    return "ap"


def fill_account_table():
    """Insert one ``account.account`` row per entry in ``users``."""
    query = (
        "insert into account.account("
        "id, "
        "email_address, "
        "password) "
        "values (%s, %s, %s)"
    )
    with db.cursor() as cur:
        accounts = (
            (account_id, account["email"], account["password"])
            for account_id, account in users.items()
        )
        execute_batch(cur, query, accounts, page_size=1000)


def fill_account_agreement_table():
    """Insert accepted "Terms of Use" / "Privacy Policy" agreements.

    A user gets a row for an agreement only when the corresponding CSV
    column (``terms`` / ``privacy``) is not empty.
    """
    query = (
        "insert into account.account_agreement(account_id, agreement_id, accept_date)"
        "values (%s, (select id from account.agreement where agreement = %s), %s)"
    )
    with db.cursor() as cur:
        terms = (
            (account_id, "Terms of Use", format_timestamp(account["terms"]))
            for account_id, account in users.items()
            if account["terms"] != ""
        )
        privacy = (
            (account_id, "Privacy Policy", format_timestamp(account["privacy"]))
            for account_id, account in users.items()
            if account["privacy"] != ""
        )
        execute_batch(cur, query, terms, page_size=1000)
        execute_batch(cur, query, privacy, page_size=1000)


def fill_default_wake_word():
    """Insert the four default wake words and their settings."""
    query1 = (
        "insert into device.wake_word ("
        "id,"
        "setting_name,"
        "display_name,"
        "engine)"
        "values (%s, %s, %s, %s)"
    )
    query2 = (
        "insert into device.wake_word_settings("
        "wake_word_id,"
        "sample_rate,"
        "channels,"
        "pronunciation,"
        "threshold,"
        "threshold_multiplier,"
        "dynamic_energy_ratio)"
        "values (%s, %s, %s, %s, %s, %s, %s)"
    )
    wake_words = [
        (hey_mycroft, "Hey Mycroft", "Hey Mycroft", "precise"),
        (christopher, "Christopher", "Christopher", "precise"),
        (ezra, "Hey Ezra", "Hey Ezra", "precise"),
        (jarvis, "Hey Jarvis", "Hey Jarvis", "precise"),
    ]
    wake_word_settings = [
        (hey_mycroft, "16000", "1", "HH EY . M AY K R AO F T", "1e-90", "1", "1.5"),
        (christopher, "16000", "1", "K R IH S T AH F ER .", "1e-25", "1", "1.5"),
        (ezra, "16000", "1", "HH EY . EH Z R AH", "1e-10", "1", "2.5"),
        (jarvis, "16000", "1", "HH EY . JH AA R V AH S", "1e-25", "1", "1.5"),
    ]
    with db.cursor() as cur:
        execute_batch(cur, query1, wake_words)
        execute_batch(cur, query2, wake_word_settings)


def fill_wake_word_table():
    """Insert custom (non-default) wake words and tag every user with one.

    Side effect: sets ``users[user_id]["wake_word_id"]`` for every user,
    which ``fill_wake_word_settings_table`` and ``fill_device_table`` read.
    """
    query = (
        "insert into device.wake_word ("
        "id,"
        "setting_name,"
        "display_name,"
        "engine,"
        "account_id)"
        "values (%s, %s, %s, %s, %s)"
    )

    def map_wake_word(user_id):
        # Users without settings are treated as using "hey mycroft".
        if user_id in user_settings:
            wake_word = user_settings[user_id]["wake_word"].lower()
        else:
            wake_word = "hey mycroft"
        # Reuse the shared id for a default wake word, otherwise mint one.
        wake_word_id = default_wake_words.get(wake_word)
        if wake_word_id is None:
            wake_word_id = str(uuid.uuid4())
        users[user_id]["wake_word_id"] = wake_word_id
        return wake_word_id, wake_word, wake_word, "precise", user_id

    with db.cursor() as cur:
        # map_wake_word must run for every user (it records the id), but only
        # rows for non-default wake words are inserted.
        wake_words = (
            row
            for row in map(map_wake_word, users)
            if row[0] not in _default_wake_word_ids
        )
        execute_batch(cur, query, wake_words, page_size=1000)


def fill_account_preferences_table():
    """Insert ``device.account_preferences`` rows for every user.

    Side effect: sets ``users[user_uuid]["text_to_speech_id"]`` for every
    user. Users without settings get MM/DD/YYYY, 12 Hour, Imperial and the
    "ap" voice.
    """
    query = (
        "insert into device.account_preferences("
        "account_id, "
        "date_format, "
        "time_format, "
        "measurement_system)"
        "values (%s, %s, %s, %s)"
    )
    measurement_names = {"metric": "Metric", "imperial": "Imperial"}

    def map_account_preferences(user_uuid):
        user_setting = user_settings.get(user_uuid)
        if user_setting is None:
            users[user_uuid]["text_to_speech_id"] = get_tts_uuid("ap")
            return user_uuid, "MM/DD/YYYY", "12 Hour", "Imperial"

        if user_setting["date_format"] == "DMY":
            date_format = "DD/MM/YYYY"
        else:
            date_format = "MM/DD/YYYY"
        if user_setting["time_format"] == "full":
            time_format = "24 Hour"
        else:
            time_format = "12 Hour"
        # Values other than metric/imperial are passed through untouched.
        measurement_system = user_setting["measurement_system"]
        measurement_system = measurement_names.get(
            measurement_system, measurement_system
        )
        users[user_uuid]["text_to_speech_id"] = get_tts_uuid(
            _tts_setting_name(user_setting)
        )
        return user_uuid, date_format, time_format, measurement_system

    with db.cursor() as cur:
        account_preferences = (
            map_account_preferences(user_uuid) for user_uuid in users
        )
        execute_batch(cur, query, account_preferences, page_size=1000)


def fill_subscription_table():
    """Insert one ``account.account_membership`` row per subscription."""
    query = (
        "insert into account.account_membership("
        "account_id, "
        "membership_id, "
        "membership_ts_range, "
        "payment_account_id,"
        "payment_method,"
        "payment_id) "
        "values (%s, %s, %s, %s, %s, %s)"
    )
    rate_periods = {"MonthlyAccount": "month", "YearlyAccount": "year"}

    def map_subscription(user_uuid):
        subscr = subscription[user_uuid]
        start = format_timestamp(subscr["last_payment_ts"])
        # Open-ended range starting at the last payment.
        subscription_ts_range = "[{},)".format(start)
        # Unknown account types are looked up under their original name.
        subscription_type = rate_periods.get(subscr["type"], subscr["type"])
        return (
            user_uuid,
            get_subscription_uuid(subscription_type),
            subscription_ts_range,
            subscr["stripe_customer_id"],
            "Stripe",
            "subscription_id",
        )

    with db.cursor() as cur:
        account_subscriptions = (
            map_subscription(user_uuid) for user_uuid in subscription
        )
        execute_batch(cur, query, account_subscriptions, page_size=1000)


def fill_wake_word_settings_table():
    """Insert settings for custom (non-default) wake words.

    Requires ``fill_wake_word_table`` to have run first, because it reads
    ``users[user_uuid]["wake_word_id"]``.
    """
    query = (
        "insert into device.wake_word_settings("
        "wake_word_id,"
        "sample_rate,"
        "channels,"
        "pronunciation,"
        "threshold,"
        "threshold_multiplier,"
        "dynamic_energy_ratio)"
        "values (%s, %s, %s, %s, %s, %s, %s)"
    )

    def map_wake_word_settings(user_uuid):
        user_setting = user_settings[user_uuid]
        return (
            users[user_uuid]["wake_word_id"],
            user_setting["sample_rate"],
            user_setting["channels"],
            user_setting["pronunciation"],
            user_setting["threshold"],
            user_setting["threshold_multiplier"],
            user_setting["dynamic_energy_ratio"],
        )

    with db.cursor() as cur:
        account_wake_word_settings = (
            map_wake_word_settings(user_uuid)
            for user_uuid in users
            if user_uuid in user_settings
        )
        # Settings for the default wake words are inserted by
        # fill_default_wake_word(), so skip them here.
        account_wake_word_settings = (
            row
            for row in account_wake_word_settings
            if row[0] not in _default_wake_word_ids
        )
        execute_batch(cur, query, account_wake_word_settings, page_size=1000)


def change_device_name():
    """Make device names unique per user.

    When one user owns several devices with the same name, each of those
    devices is renamed in ``devices`` to ``"<name>-<device uuid>"``.
    """
    for user, owned in user_devices.items():
        if user not in users:
            continue
        device_names = defaultdict(list)
        for device_uuid, name in owned:
            device_names[name].append(device_uuid)
        for name, device_uuids in device_names.items():
            if len(device_uuids) > 1:
                for device_uuid in device_uuids:
                    devices[device_uuid]["name"] = "{name}-{uuid}".format(
                        name=name, uuid=device_uuid
                    )


def fill_device_table():
    """Insert ``device.geography`` and ``device.device`` rows.

    Only users present in both ``users`` and ``user_settings`` are migrated.
    Each such user gets a single geography row, derived from the location of
    their first device, which all of their devices then share. Devices
    without a location, or whose location is not found in the database, use
    the Lawrence / Kansas / America/Chicago geography.

    Requires ``fill_wake_word_table`` to have run first, because it reads
    ``users[account_id]["wake_word_id"]``.

    Raises:
        RuntimeError: if the default geography is not in the database.
    """
    query = (
        "insert into device.device("
        "id, "
        "account_id, "
        "name, "
        "placement,"
        "platform,"
        "enclosure_version,"
        "core_version,"
        "wake_word_id,"
        "geography_id,"
        "text_to_speech_id) "
        "values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    query2 = (
        "insert into device.geography("
        "id,"
        "account_id,"
        "country_id,"
        "region_id,"
        "city_id,"
        "timezone_id) "
        "values (%s, %s, %s, %s, %s, %s)"
    )
    # Shared lookup: ids of a city together with its region, country and
    # timezone. Callers append further conditions as needed.
    geography_lookup = """
        SELECT
            city.id, region.id, country.id, timezone.id
        FROM
            geography.city city
        INNER JOIN
            geography.region region ON city.region_id = region.id
        INNER JOIN
            geography.country country ON region.country_id = country.id
        INNER JOIN
            geography.timezone timezone ON country.id = timezone.country_id
        WHERE
            city.name = %s
            and region.name = %s
            and timezone.name = %s
    """
    geography_lookup_with_country = geography_lookup + " and country.name = %s"

    with db.cursor() as cur:
        cur.execute(geography_lookup, ("Lawrence", "Kansas", "America/Chicago"))
        defaults = cur.fetchone()
    if defaults is None:
        raise RuntimeError(
            "default geography (Lawrence, Kansas, America/Chicago) "
            "not found in the geography schema"
        )
    city_default, region_default, country_default, timezone_default = defaults

    def map_geography(account_id, device_id):
        geography_id = str(uuid.uuid4())
        location_uuid = devices[device_id].get("location")
        if location_uuid is not None:
            location = locations[location_uuid]
            # The CSV timezone "code" is what is matched against
            # geography.timezone.name.
            timezone = timezones[location["timezone"]]["code"]
            city_entity = cities[location["city"]]
            region_entity = regions[city_entity["region"]]
            country_entity = countries[region_entity["country"]]
            with db.cursor() as geo_cur:
                geo_cur.execute(
                    geography_lookup_with_country,
                    (
                        city_entity["name"],
                        region_entity["name"],
                        timezone,
                        country_entity["name"],
                    ),
                )
                result = geo_cur.fetchone()
            if result is not None:
                city_id, region_id, country_id, timezone_id = result
                return (
                    geography_id,
                    account_id,
                    country_id,
                    region_id,
                    city_id,
                    timezone_id,
                )
        return (
            geography_id,
            account_id,
            country_default,
            region_default,
            city_default,
            timezone_default,
        )

    def map_device(device_id):
        device = devices[device_id]
        account_id = device["user_uuid"]
        text_to_speech_id = get_tts_uuid(
            _tts_setting_name(user_settings[account_id])
        )
        return (
            device_id,
            account_id,
            device["name"],
            device["description"],
            device["platform"],
            device["enclosure_version"],
            device["core_version"],
            users[account_id]["wake_word_id"],
            device["geography_id"],
            text_to_speech_id,
        )

    def is_migrated(user):
        return user in users and user in user_settings

    with db.cursor() as cur:
        geography_batch = []
        for user, owned in user_devices.items():
            if not is_migrated(user):
                continue
            first_device_id, _ = owned[0]
            geography = map_geography(user, first_device_id)
            geography_batch.append(geography)
            # Every device of the user shares that one geography row.
            for device_id, _ in owned:
                devices[device_id]["geography_id"] = geography[0]
        execute_batch(cur, query2, geography_batch, page_size=1000)

        devices_batch = (
            map_device(device_id)
            for user, owned in user_devices.items()
            if is_migrated(user)
            for device_id, _ in owned
        )
        execute_batch(cur, query, devices_batch, page_size=1000)


def _build_skill_sections(skill_uuid):
    """Return ``(sections, settings)`` for one skill from the CSV data.

    ``sections`` is a list of ``{"name": ..., "fields": [...]}`` dicts and
    ``settings`` maps field name to field value for every field that has a
    value in ``skill_field_values``.
    """
    sections = []
    settings = {}
    for section_uuid in skill_to_section.get(skill_uuid, ()):
        fields = []
        for field_uuid in section_to_field.get(section_uuid, ()):
            field = skill_fields[field_uuid]
            fields.append(field)
            if field_uuid in skill_field_values:
                settings[field["name"]] = skill_field_values[field_uuid][
                    "field_value"
                ]
        sections.append(
            {"name": skill_sections[section_uuid]["section"], "fields": fields}
        )
    return sections, settings


def fill_skills_table():
    """Insert skills, their settings display and the per-device settings.

    Only devices of users present in both ``users`` and ``user_settings``
    are considered.
    """
    skills_batch = []
    settings_display_batch = []
    device_skill_batch = []
    for user, owned in user_devices.items():
        if user not in users or user not in user_settings:
            continue
        for device_uuid, _ in owned:
            for skill_uuid in device_to_skill.get(device_uuid, ()):
                skill = skills[skill_uuid]
                skill_name = skill["name"]
                sections, settings = _build_skill_sections(skill_uuid)
                skill_setting_display = {
                    "name": skill_name,
                    "identifier": skill["identifier"],
                    "skillMetadata": {"sections": sections},
                }
                skills_batch.append((skill_uuid, skill_name))
                meta_id = str(uuid.uuid4())
                settings_display_batch.append(
                    (meta_id, skill_uuid, json.dumps(skill_setting_display))
                )
                device_skill_batch.append(
                    (device_uuid, skill_uuid, meta_id, json.dumps(settings))
                )

    with db.cursor() as curr:
        query = "insert into skill.skill(id, name) values (%s, %s)"
        execute_batch(curr, query, skills_batch, page_size=1000)
        query = "insert into skill.settings_display(id, skill_id, settings_display) values (%s, %s, %s)"
        execute_batch(curr, query, settings_display_batch, page_size=1000)
        query = (
            "insert into device.device_skill(device_id, skill_id, skill_settings_display_id, settings) "
            "values (%s, %s, %s, %s)"
        )
        execute_batch(curr, query, device_skill_batch, page_size=1000)


def analyze_locations():
    """Print every CSV city that has no match in the geography schema.

    A city matches when a row with the same city and country name exists
    whose region name equals either the CSV region name or that name with
    administrative words ("County", "Province", ...) removed.
    """
    # Words stripped from the CSV region name for the second comparison.
    remove = frozenset(
        [
            "District",
            "Region",
            "Development",
            "Prefecture",
            "Community",
            "County",
            "Province",
            "Division",
            "Voivodeship",
            "State",
            "of",
            "Governorate",
        ]
    )
    # Parameterised so names containing quotes need no manual escaping.
    query = (
        "select city.name "
        "from geography.city city "
        "inner join geography.region region on city.region_id = region.id "
        "inner join geography.country country on region.country_id = country.id "
        "where "
        "city.name = %s and "
        "(region.name = %s or region.name = %s) and "
        "country.name = %s"
    )
    mismatches = 0
    g_mismatches = defaultdict(lambda: defaultdict(list))
    with db.cursor() as curr:
        for city in cities.values():
            region = regions[city["region"]]
            country = countries[region["country"]]
            city_name = city["name"]
            original_region_name = region["name"]
            country_name = country["name"]
            region_name = " ".join(
                word for word in original_region_name.split() if word not in remove
            )
            curr.execute(
                query,
                (city_name, original_region_name, region_name, country_name),
            )
            if curr.fetchone() is None:
                mismatches += 1
                # Grouped under the stripped region name.
                g_mismatches[country_name][region_name].append(city_name)

    for country2, regions2 in g_mismatches.items():
        for region2, cities2 in regions2.items():
            for city2 in cities2:
                print("{} - {} - {}".format(country2, region2, city2))
    print("Number os mismatches: {}".format(mismatches))


def analyze_location_2():
    """Report, per CSV location, the matching or the nearest database city.

    Prints ``Match: <city>`` when country, region and city names all match a
    database city. Otherwise prints the geographically closest database city
    within the same country code (``None`` when that country has no cities).
    """
    # (country name, region name, city name) -> city id
    known_cities = {}
    # country iso code -> [(city name, latitude, longitude), ...]
    locations_from_db = defaultdict(list)
    with db.cursor() as cur:
        cur.execute(
            "select "
            "c1.id, "
            "c1.name, "
            "c1.latitude, "
            "c1.longitude, "
            "r.name, "
            "c2.name, "
            "c2.iso_code "
            "from geography.city c1 "
            "inner join geography.region r on c1.region_id = r.id "
            "inner join geography.country c2 on r.country_id = c2.id"
        )
        for c1_id, c1, latitude, longitude, r, c2_name, c2_code in cur:
            known_cities[(c2_name, r, c1)] = c1_id
            locations_from_db[c2_code].append((c1, latitude, longitude))

    for location_uuid, location in locations.items():
        # NOTE(review): coordinates is keyed by the first column of
        # coordinate.csv, yet it is indexed with the *location* uuid here and
        # location["coordinate"] is never used. Confirm against the CSV
        # export that both ids coincide; left unchanged.
        coordinate = coordinates[location_uuid]
        city = cities[location["city"]]
        city_name = city["name"]
        region = regions[city["region"]]
        country = countries[region["country"]]

        match_key = (country["name"], region["name"], city_name)
        if known_cities.get(match_key) is not None:
            print("Match: {}".format(city_name))
            continue

        result_name = None
        candidates = locations_from_db[country["code"]]
        if candidates:
            target = (float(coordinate["latitude"]), float(coordinate["longitude"]))

            def distance_to_target(candidate):
                _, latitude, longitude = candidate
                return distance((float(latitude), float(longitude)), target).km

            # min() keeps the first of equally distant cities, like the
            # strict "<" comparison it replaces.
            result_name = min(candidates, key=distance_to_target)[0]
        print("Actual: {}, calculated: {}".format(city_name, result_name))


start = time.time()
load_csv()
end = time.time()

print("Time to load CSVs {}".format(end - start))

# Import pipeline. The individual steps are toggled by (un)commenting them;
# only the location analysis is currently enabled.
start = time.time()
print("Importing account table")
# fill_account_table()
print("Importing agreements table")
# fill_account_agreement_table()
print("Importing account preferences table")
# fill_account_preferences_table()
print("Importing subscription table")
# fill_subscription_table()
print("Importing wake word table")
# fill_default_wake_word()
# fill_wake_word_table()
print("Importing wake word settings table")
# fill_wake_word_settings_table()
print("Importing device table")
# change_device_name()
# fill_device_table()
print("Importing skills table")
# fill_skills_table()
analyze_location_2()
end = time.time()
print("Time to import: {}".format(end - start))