selene-backend/db/scripts/bootstrap_mycroft_db.py
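
"""Bootstrap the mycroft database.

Drops any existing mycroft and template databases, rebuilds the schemas and
tables in a template database, copies the template into the mycroft database,
then loads reference data (membership and text-to-speech data, the agreement
documents, and the geography tables).
"""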

from glob import glob
from os import environ, path, remove
from markdown import markdown
from psycopg2 import connect
MYCROFT_DB_DIR = path.join(path.abspath('..'), 'mycroft')
SCHEMAS = ('account', 'skill', 'device', 'geography', 'metrics')
DB_DESTROY_FILES = (
    'drop_mycroft_db.sql',
    'drop_template_db.sql',
)
DB_CREATE_FILES = (
    'create_template_db.sql',
)
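
# The *_TABLE_ORDER tuples list the tables in the order they must be created;
# tables referenced by foreign keys have to exist before the tables that
# reference them.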
ACCOUNT_TABLE_ORDER = (
    'account',
    'agreement',
    'account_agreement',
    'membership',
    'account_membership',
)
SKILL_TABLE_ORDER = (
    'skill',
    'settings_display',
    'display',
    'oauth_credential',
    'oauth_token',
)
DEVICE_TABLE_ORDER = (
    'category',
    'geography',
    'text_to_speech',
    'wake_word',
    'wake_word_settings',
    'account_preferences',
    'account_defaults',
    'device',
    'device_skill',
)
GEOGRAPHY_TABLE_ORDER = (
    'country',
    'timezone',
    'region',
    'city',
)
METRICS_TABLE_ORDER = (
    'api',
    'job',
)
schema_directory = '{}_schema'


def get_sql_from_file(file_path: str) -> str:
    """Read the SQL statements contained in a file under MYCROFT_DB_DIR."""
    with open(path.join(MYCROFT_DB_DIR, file_path)) as sql_file:
        sql = sql_file.read()

    return sql


class PostgresDB(object):
    """Thin convenience wrapper around a psycopg2 connection."""

    def __init__(self, db_name, user=None):
        db_host = environ['DB_HOST']
        db_port = environ['DB_PORT']
        db_ssl_mode = environ.get('DB_SSL_MODE')
        if db_name in ('postgres', 'defaultdb'):
            db_user = environ['POSTGRES_DB_USER']
            db_password = environ.get('POSTGRES_DB_PASSWORD')
        else:
            db_user = environ['MYCROFT_DB_USER']
            db_password = environ['MYCROFT_DB_PASSWORD']
        if user is not None:
            db_user = user
        self.db = connect(
            dbname=db_name,
            user=db_user,
            password=db_password,
            host=db_host,
            port=db_port,
            sslmode=db_ssl_mode
        )
        self.db.autocommit = True

    def close_db(self):
        self.db.close()

    def execute_sql(self, sql: str, args=None):
        cursor = self.db.cursor()
        cursor.execute(sql, args)
        return cursor


postgres_db = PostgresDB(db_name=environ['POSTGRES_DB_NAME'])

print('Destroying any objects we will be creating later.')
for db_destroy_file in DB_DESTROY_FILES:
    postgres_db.execute_sql(
        get_sql_from_file(db_destroy_file)
    )

print('Creating the mycroft database')
for db_setup_file in DB_CREATE_FILES:
    postgres_db.execute_sql(
        get_sql_from_file(db_setup_file)
    )

postgres_db.close_db()
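
# All schema objects are built in the template database first; once complete,
# the template is copied to create the mycroft database (create_mycroft_db.sql
# below).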
template_db = PostgresDB(db_name='mycroft_template')

print('Creating the extensions')
template_db.execute_sql(
    get_sql_from_file(path.join('create_extensions.sql'))
)

print('Creating user-defined data types')
type_directory = path.join(MYCROFT_DB_DIR, 'types')
for type_file in glob(type_directory + '/*.sql'):
    template_db.execute_sql(
        get_sql_from_file(path.join(type_directory, type_file))
    )

print('Create the schemas and grant access')
for schema in SCHEMAS:
    template_db.execute_sql(
        get_sql_from_file(schema + '_schema/create_schema.sql')
    )

print('Creating the account schema tables')
# These are created first as other schemas have tables with
# foreign keys to these tables.
for table in ACCOUNT_TABLE_ORDER:
    create_table_file = path.join(
        'account_schema',
        'tables',
        table + '.sql'
    )
    template_db.execute_sql(
        get_sql_from_file(create_table_file)
    )

print('Creating the skill schema tables')
# Create the skill schema tables second as other schemas have tables with
# foreign keys to these tables.
for table in SKILL_TABLE_ORDER:
    create_table_file = path.join(
        'skill_schema',
        'tables',
        table + '.sql'
    )
    template_db.execute_sql(
        get_sql_from_file(create_table_file)
    )

print('Creating the geography schema tables')
for table in GEOGRAPHY_TABLE_ORDER:
    create_table_file = path.join(
        'geography_schema',
        'tables',
        table + '.sql'
    )
    template_db.execute_sql(
        get_sql_from_file(create_table_file)
    )

print('Creating the device schema tables')
for table in DEVICE_TABLE_ORDER:
    create_table_file = path.join(
        'device_schema',
        'tables',
        table + '.sql'
    )
    template_db.execute_sql(
        get_sql_from_file(create_table_file)
    )

print('Creating the metrics schema tables')
for table in METRICS_TABLE_ORDER:
    create_table_file = path.join(
        'metrics_schema',
        'tables',
        table + '.sql'
    )
    template_db.execute_sql(
        get_sql_from_file(create_table_file)
    )

print('Granting access to schemas and tables')
for schema in SCHEMAS:
    template_db.execute_sql(
        get_sql_from_file(schema + '_schema/grants.sql')
    )

template_db.close_db()

print('Copying template to new database.')
postgres_db = PostgresDB(db_name=environ['POSTGRES_DB_NAME'])
postgres_db.execute_sql(get_sql_from_file('create_mycroft_db.sql'))
postgres_db.close_db()

mycroft_db = PostgresDB(db_name=environ['MYCROFT_DB_NAME'])

insert_files = [
    dict(schema_dir='account_schema', file_name='membership.sql'),
    dict(schema_dir='device_schema', file_name='text_to_speech.sql'),
]
for insert_file in insert_files:
    insert_file_path = path.join(
        insert_file['schema_dir'],
        'data',
        insert_file['file_name']
    )
    try:
        mycroft_db.execute_sql(
            get_sql_from_file(insert_file_path)
        )
    except FileNotFoundError:
        pass
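
# The agreement documents are stored as large objects.  Each markdown document
# is converted to HTML, written to a large object, and the agreement row
# points at the large object's OID.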
print('Building account.agreement table')
mycroft_db.db.autocommit = False
insert_sql = (
    "insert into account.agreement VALUES (default, '{}', '1', '[today,]', {})"
)
doc_dir = '/Users/chrisveilleux/Mycroft/github/documentation/_pages/'
docs = {
    'Privacy Policy': doc_dir + 'embed-privacy-policy.md',
    'Terms of Use': doc_dir + 'embed-terms-of-use.md'
}
try:
    for agrmt_type, doc_path in docs.items():
        lobj = mycroft_db.db.lobject(0, 'b')
        with open(doc_path) as doc:
            # Skip the document's metadata header, which is delimited by a
            # pair of lines containing only "---".
            header_delimiter_count = 0
            while True:
                rec = doc.readline()
                if rec == '---\n':
                    header_delimiter_count += 1
                if header_delimiter_count == 2:
                    break
            doc_html = markdown(
                doc.read(),
                output_format='html5'
            )
        lobj.write(doc_html)
        mycroft_db.execute_sql(
            insert_sql.format(agrmt_type, lobj.oid)
        )
        mycroft_db.execute_sql(
            "grant select on large object {} to selene".format(lobj.oid)
        )
    # The Open Dataset agreement has no document associated with it.
    mycroft_db.execute_sql(
        insert_sql.format('Open Dataset', 'null')
    )
except:
    mycroft_db.db.rollback()
    raise
else:
    mycroft_db.db.commit()

mycroft_db.db.autocommit = True
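
# The geography tables are loaded from tab-delimited reference files
# (country.txt, regions.txt, timezones.txt and cities500.txt), which appear to
# be GeoNames exports, read from a local directory.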
reference_file_dir = '/Users/chrisveilleux/Mycroft'

print('Building geography.country table')
country_file = 'country.txt'
country_insert = """
    INSERT INTO
        geography.country (iso_code, name)
    VALUES
        (%(iso_code)s, %(country_name)s)
"""
with open(path.join(reference_file_dir, country_file)) as countries:
    # Skip the comment header; the last header line starts with "#ISO".
    while True:
        rec = countries.readline()
        if rec.startswith('#ISO'):
            break
    for country in countries.readlines():
        country_fields = country.split('\t')
        insert_args = dict(
            iso_code=country_fields[0],
            country_name=country_fields[4]
        )
        mycroft_db.execute_sql(country_insert, insert_args)

print('Building geography.region table')
region_file = 'regions.txt'
region_insert = """
    INSERT INTO
        geography.region (country_id, region_code, name)
    VALUES
        (
            (SELECT id FROM geography.country WHERE iso_code = %(iso_code)s),
            %(region_code)s,
            %(region_name)s
        )
"""
with open(path.join(reference_file_dir, region_file)) as regions:
    for region in regions.readlines():
        region_fields = region.split('\t')
        country_iso_code = region_fields[0][:2]
        insert_args = dict(
            iso_code=country_iso_code,
            region_code=region_fields[0],
            region_name=region_fields[1]
        )
        mycroft_db.execute_sql(region_insert, insert_args)

print('Building geography.timezone table')
timezone_file = 'timezones.txt'
timezone_insert = """
    INSERT INTO
        geography.timezone (country_id, name, gmt_offset, dst_offset)
    VALUES
        (
            (SELECT id FROM geography.country WHERE iso_code = %(iso_code)s),
            %(timezone_name)s,
            %(gmt_offset)s,
            %(dst_offset)s
        )
"""
with open(path.join(reference_file_dir, timezone_file)) as timezones:
    # The first line of the timezone file is a column header.
    timezones.readline()
    for timezone in timezones.readlines():
        timezone_fields = timezone.split('\t')
        insert_args = dict(
            iso_code=timezone_fields[0],
            timezone_name=timezone_fields[1],
            gmt_offset=timezone_fields[2],
            dst_offset=timezone_fields[3]
        )
        mycroft_db.execute_sql(timezone_insert, insert_args)
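
# Cities are bulk loaded: build in-memory lookups of region and timezone ids,
# write a tab-delimited dump file, then load it with COPY.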
print('Building geography.city table')
cities_file = 'cities500.txt'

region_query = "SELECT id, region_code FROM geography.region"
query_result = mycroft_db.execute_sql(region_query)
region_lookup = dict()
for row in query_result.fetchall():
    region_lookup[row[1]] = row[0]

timezone_query = "SELECT id, name FROM geography.timezone"
query_result = mycroft_db.execute_sql(timezone_query)
timezone_lookup = dict()
for row in query_result.fetchall():
    timezone_lookup[row[1]] = row[0]

# city_insert = """
#     INSERT INTO
#         geography.city (region_id, timezone_id, name, latitude, longitude)
#     VALUES
#         (%(region_id)s, %(timezone_id)s, %(city_name)s, %(latitude)s, %(longitude)s)
# """

with open(path.join(reference_file_dir, cities_file)) as cities:
    with open(path.join(reference_file_dir, 'city.dump'), 'w') as dump_file:
        for city in cities.readlines():
            city_fields = city.split('\t')
            city_region = city_fields[8] + '.' + city_fields[10]
            region_id = region_lookup.get(city_region)
            timezone_id = timezone_lookup[city_fields[17]]
            if region_id is not None:
                # The lookup values are integer ids; convert them to strings
                # before writing the tab-delimited dump record.
                dump_file.write('\t'.join([
                    str(region_id),
                    str(timezone_id),
                    city_fields[1],
                    city_fields[4],
                    city_fields[5]
                ]) + '\n')
                # mycroft_db.execute_sql(city_insert, insert_args)

with open(path.join(reference_file_dir, 'city.dump')) as dump_file:
    cursor = mycroft_db.db.cursor()
    cursor.copy_from(
        dump_file,
        'geography.city',
        columns=('region_id', 'timezone_id', 'name', 'latitude', 'longitude')
    )

remove(path.join(reference_file_dir, 'city.dump'))

mycroft_db.close_db()