# CS224N_proj/reader.py at master · axelsly/CS224N_proj · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import tensorflow as tf
import numpy as np
import os
import re
import json
import pickle

# Metadata + ~50 characters, then a sliding window of (t+1)
# The feed dict should pass in an initial state (the previous final state)
# Train on an entire song & batch across different songs?
# Train on individual window examples


def abc_filenames(datapath):
    """Return the paths of the regular files located directly in ``datapath``.

    Each returned path is ``datapath`` joined with the entry name, so it is
    relative or absolute exactly as ``datapath`` is. Sub-directories are left
    out. Entries appear in the order ``os.listdir`` reports them.
    """
    regular_files = []
    for entry in os.listdir(datapath):
        candidate = os.path.join(datapath, entry)
        if os.path.isfile(candidate):
            regular_files.append(candidate)
    return regular_files


def abc_batch(iterable, n=1):
    """Cut ``iterable`` into consecutive slices of exactly ``n`` items.

    ``iterable`` must support ``len`` and slicing. A trailing remainder
    shorter than ``n`` is dropped, so every returned batch has the same
    length. Returns a list of slices (empty when no full batch fits).
    """
    total = len(iterable)
    # Start offsets of every window that fits entirely inside the sequence.
    full_starts = range(0, total - n + 1, n)
    return [iterable[start:start + n] for start in full_starts]


def read_abc_pickle(train_file):
    """Load and return the object stored in the pickle file ``train_file``.

    The file is opened in binary mode: pickle data is a byte stream, and
    text mode either fails outright (Python 3) or risks newline translation
    corrupting the payload.

    NOTE: unpickling can execute arbitrary code, so only pass files that
    come from a trusted source.
    """
    with open(train_file, 'rb') as fd:
        return pickle.load(fd)


def compute_save_vocabulary(datapath):
    """Build the symbol vocabulary for a dataset directory and save it as JSON.

    Every file directly inside ``datapath`` is parsed with ``read_abc``; each
    distinct symbol it yields receives an integer index. The resulting
    ``{symbol: index}`` mapping is written to ``vocabulary.json`` in the
    current working directory (overwriting any existing file).

    Indices are assigned in sorted symbol order so that rebuilding the
    vocabulary from the same data always produces the same mapping.
    """
    unique_characters = set()
    for filename in abc_filenames(datapath):
        unique_characters.update(read_abc(filename))

    # Set iteration order is not guaranteed to be stable between interpreter
    # runs (string hashing may be randomized), so sort before numbering.
    vocabulary = {
        symbol: index
        for index, symbol in enumerate(sorted(unique_characters))
    }
    with open('vocabulary.json', 'w') as v:
        json.dump(vocabulary, v)


def load_vocabulary():
    """Read ``vocabulary.json`` from the current working directory.

    Returns the decoded JSON content (the mapping written by
    ``compute_save_vocabulary`` when that function produced the file).
    """
    with open('vocabulary.json', 'r') as vocab_file:
        raw_text = vocab_file.read()
    return json.loads(raw_text)


def get_abs_files(datapath):
    """Parse every file located directly in ``datapath`` with ``read_abc``.

    Returns a list with one entry per file, each entry being the symbol list
    produced by ``read_abc`` for that file (symbols, not vocabulary indices).
    Files are visited in the order ``abc_filenames`` returns them.
    """
    return [read_abc(song_path) for song_path in abc_filenames(datapath)]


def abc_to_index(filename, vocabulary):
    """Encode the symbols of one ABC file as vocabulary indices.

    ``filename`` is parsed with ``read_abc`` and every resulting symbol is
    looked up in ``vocabulary``. Returns the list of looked-up values in
    symbol order. A symbol missing from ``vocabulary`` raises ``KeyError``.
    """
    encoded = []
    for symbol in read_abc(filename):
        encoded.append(vocabulary[symbol])
    return encoded


def read_abc(filename, exclude_title=True):
    """Parse one ABC tune file into a flat list of symbols.

    The file is expected to consist of header lines of the form
    ``<field>:<value>`` followed by a single final line holding the tune
    body. The result is the lower-cased header values (one symbol per header
    line) followed by the individual characters of the body line.

    Args:
        filename: Path of the file to read.
        exclude_title: When true (the default), the first line of the file
            is discarded before parsing.

    Returns:
        A list of strings: header values, then one string per body
        character. An empty list is returned when no lines remain to parse.

    Raises:
        IndexError: If a header line contains no ``:`` separator.
    """
    with open(filename, 'r') as f:
        # Strip line terminators here rather than matching literal "\r\n" /
        # "\r\r\n" sequences, so the result does not depend on whether the
        # platform or Python version translates newlines while reading.
        data = [line.rstrip("\r\n") for line in f]

    # Newline translation can turn a "\r\r\n" ending into an extra blank
    # line; drop trailing blanks so the last entry really is the tune body.
    while data and not data[-1]:
        data.pop()

    if exclude_title:
        data = data[1:]
    if not data:
        return []

    # Header value = text between the first and second ':' (if any), as before.
    metadata = [meta.split(":")[1].lower() for meta in data[:-1]]
    return metadata + list(data[-1])


def abc_producer(char_ids, batch_size):
    """Placeholder with no implementation yet.

    Both arguments are currently ignored and the function returns ``None``.
    """
    return None


def main(_):
    """Script entry point: rebuild the vocabulary and encode one sample tune.

    Recomputes ``vocabulary.json`` from the ``sample_data`` directory, loads
    it back, prints it, and then encodes a single sample file against it.
    The positional argument is ignored.
    """
    datapath = "sample_data"
    compute_save_vocabulary(datapath)
    vocabulary = load_vocabulary()
    # Parenthesised form is valid under both Python 2 and Python 3.
    print(vocabulary)
    filename = "sample_data/Zycanthos jig_0.abc"
    # The encoded result is not used yet; the call is kept so the sample file
    # is still parsed and checked against the vocabulary.
    abc_to_index(filename, vocabulary)


# Run only when this file is executed as a script, not when it is imported.
# NOTE(review): tf.app.run() is presumably what hands control to main(_)
# above — confirm against the installed TensorFlow version.
if __name__ == "__main__":
    tf.app.run()