Change database to simple text file

2018-03-16 02:06:00 -05:00 · 2018-03-16 02:06:00 -05:00 · 0fe1bceede
parent 09e8b91ad2
commit 0fe1bceede
3 changed files with 20 additions and 17 deletions
--- a/precise/train_data.py
+++ b/precise/train_data.py
@ -50,19 +50,18 @@ class TrainData:
    @classmethod
    def from_db(cls, db_file: str, db_folder: str) -> 'TrainData':
        """
-        Load a set of data from an SQLite database in the following format:
-            Column: "final_tag"
-            Value: "wake-word" or "not-wake-word"
+        Load a set of data from a text database in the following format:
+            <file_id>  (tab)  <tag>
+            <file_id>  (tab)  <tag>

-            Column: "data_id"
-            Value: identifier of file such that the following
-                   file exists: {db_folder}/{data_id}.wav
+            file_id: identifier of file such that the following
+                     file exists: {db_folder}/{file_id}.wav
+            tag: "wake-word" or "not-wake-word"
        """
        if not db_file:
            return cls(([], []), ([], []))
        if not isfile(db_file):
            raise RuntimeError('Database file does not exist: ' + db_file)
-        import dataset

        train_groups = {}
        train_group_file = db_file.replace('db', '') + 'groups.json'
@ -70,14 +69,19 @@ class TrainData:
            with open(train_group_file) as f:
                train_groups = json.load(f)

-        db = dataset.connect('sqlite:///' + db_file)
-        files = [
-            [join(db_folder, i['data_id'] + '.wav') for i in db['data'].find(final_tag=tag)]
-            for tag in ['wake-word', 'not-wake-word']
-        ]
+        db_files = {
+            'wake-word': [],
+            'not-wake-word': []
+        }
+        with open(db_file) as f:
+            for line in f.read().split('\n'):
+                if not line:
+                    continue
+                file, tag = line.split('\t')
+                db_files[tag.strip()].append(file.strip())

        train_files, test_files = ([], []), ([], [])
-        for label, rows in enumerate(files):
+        for label, rows in enumerate([db_files['wake-word'], db_files['not-wake-word']]):
            for fn in rows:
                if not isfile(fn):
                    continue
@ -135,7 +139,9 @@ class TrainData:
    def parse_args(parser: ArgumentParser) -> Any:
        """Return parsed args from parser, adding options for train data inputs"""
        parser.add_argument('db_folder', help='Folder to load database references from')
-        parser.add_argument('-db', '--db-file', default='', help='Database file to use')
+        parser.add_argument(
+            '-db', '--db-file', default='', help='Text database to load from where '
+                                                 'each line is <file_id>\t(wake-word|not-wake-word) and {db_folder}/<file_id>.wav exists..')
        parser.add_argument('-d', '--data-dir', default='{db_folder}',
                            help='Load files from a different directory')
        args = parser.parse_args()
--- a/requirements.txt
+++ b/requirements.txt
@ -4,7 +4,6 @@ altgraph==0.15
 banal==0.3.3
 bleach==1.5.0
 chardet==3.0.4
-dataset==1.0.5
 enum-compat==0.0.2
 future==0.16.0
 h5py==2.7.1
@ -27,7 +26,6 @@ PyYAML==3.12
 scipy==1.0.0
 six==1.11.0
 speechpy==2.1
-SQLAlchemy==1.2.2
 tensorflow==1.4.1
 tensorflow-tensorboard==0.4.0
 typing==3.6.4
--- a/setup.py
+++ b/setup.py
@ -48,7 +48,6 @@ setup(
        'h5py',
        'wavio',
        'typing',
-        'dataset',
        'prettyparse',
        'precise-runner'
    ],