This repository was archived by the owner on Jun 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathModel.coffee
More file actions
140 lines (124 loc) · 4.51 KB
/
Model.coffee
File metadata and controls
140 lines (124 loc) · 4.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
async = require "async"
fs = require "fs"
zlib = require "zlib"
{carry} = require "carrier"
{spawn} = require "child_process"
{Pool} = require "generic-pool"
{Counter} = require "./Counter"
{Document} = require "./Document"
# Sentence-boundary-detection model backed by an external SVM classifier
# (svm_classifyd). Feature tables and word counters are loaded from gzipped
# JSON files under @path; classifier child processes are managed by a
# generic-pool so they can be reused across classify() calls.
class Model
  constructor: (@path) ->
    # Feature name -> numeric feature id; populated by load().
    @features = {}
    # Word-frequency counters used during featurization.
    @lower_words = new Counter
    @non_abbrs = new Counter
    # Pool of svm_classifyd child processes; null until load() succeeds.
    @pool = null
    # On an uncaught exception, drain the pool first so no classifier
    # children are leaked, then re-throw the original error.
    @cleanup = (err) => @close -> throw err
    process.on "uncaughtException", @cleanup

  # Load the feature table and word counters from @path, then create the
  # classifier-process pool. Calls back with an Error if the svm_model file
  # is missing, any JSON file fails to load, or the feature table is empty.
  load: (callback) ->
    model_file_path = @path + "/svm_model"
    unless fs.existsSync model_file_path
      return callback new Error "#{model_file_path} does not exist"
    async.parallel {
      features: (callback) =>
        @loadGzippedJSON (@path + "/features.json.gz"), callback
      lower_words: (callback) =>
        @loadGzippedJSON (@path + "/lower_words.json.gz"), callback
      non_abbrs: (callback) =>
        @loadGzippedJSON (@path + "/non_abbrs.json.gz"), callback
    }, (err, o) =>
      return callback err if err?
      # An empty feature table means the model directory is unusable.
      if (f for own f of o.features).length == 0
        return callback new Error "model has no features"
      @features = o.features
      @lower_words = new Counter o.lower_words
      @non_abbrs = new Counter o.non_abbrs
      @pool = Pool {
        name: 'svmclassify'
        create: (callback) ->
          classifier = spawn "svm_classifyd", [model_file_path]
          # carrier splits stdout into per-line "line" events for us.
          classifier.carrier = carry classifier.stdout
          callback null, classifier
        destroy: (child) ->
          # SIGINT is the expected shutdown signal; classify()'s exit
          # handler treats any other termination as an error.
          child.kill "SIGINT"
        max: 5
        idleTimeoutMillis: 5000
      }
      callback()

  # Tear down: stop listening for uncaught exceptions and drain/destroy
  # the classifier pool (if one was ever created) before calling back.
  close: (callback) ->
    process.removeListener "uncaughtException", @cleanup
    return callback() unless @pool?
    @pool.drain => @pool.destroyAllNow -> callback()

  # Read a gzipped JSON file synchronously, gunzip and parse it, and call
  # back with (err, object). Read/parse errors are routed to the callback.
  loadGzippedJSON: (path, callback) ->
    try
      zlib.gunzip fs.readFileSync(path), (err, buffer) ->
        return callback err if err?
        callback null, JSON.parse buffer
    catch err
      callback err

  # Standard logistic function: maps a raw SVM margin x to (0, 1).
  logistic: (x, y=1) ->
    return 1.0 / (1 + Math.pow Math.E, (-1 * y * x))

  # Classify each fragment of doc with a pooled svm_classifyd process,
  # storing a logistic-scaled prediction on every fragment. Calls back with
  # an Error (after closing the model) on any classifier failure.
  classify: (doc, callback) ->
    return callback new Error "model has not been loaded" unless @pool?
    try
      @pool.acquire (err, classifier) =>
        if err?
          @close -> callback err
          return
        fragments = doc.getFragments()
        # callback with an err if classifier prints to stderr
        # (fat arrow: @close must bind to the Model, not the stream —
        # the thin-arrow form left `@close` undefined in this handler)
        classifier.stderr.on "data", (data) =>
          @close -> callback new Error data.toString()
        # callback with an err if the classifier dies or is killed abnormally
        classifier.on "exit", (code, signal) =>
          unless signal == "SIGINT"
            @close ->
              if code?
                callback new Error "classifier exited with code #{code}"
              else if signal?
                callback new Error "classifier killed with #{signal}"
        # parse classifier output: one numeric margin per fragment, then a
        # final line (for the trailing "\n" we write) that signals completion
        index = 0
        classifier.carrier.on "line", (line) =>
          value = parseFloat line
          if isNaN value
            @close -> callback new Error "unexpected output: #{line}"
            return
          if index == fragments.length
            # All fragments scored: detach listeners and return the
            # classifier to the pool exactly once.
            if classifier?
              classifier.carrier.removeAllListeners()
              classifier.stderr.removeAllListeners()
              classifier.removeAllListeners()
              @pool.release classifier
              classifier = null
              callback null
          else
            fragments[index++].prediction = @logistic value
        # format fragment features as sparse "id:1" pairs (sorted by id, as
        # SVM-light requires) and send one line per fragment to the classifier
        for frag in fragments
          feats = (@features[f] for f in frag.getFeatures() when f of @features)
          feats.sort (x,y) -> x-y
          classifier.stdin.write(
            "0 " + ("#{f}:1" for f in feats).join(" ") + "\n")
        classifier.stdin.write "\n"
    catch err
      callback err

  # Segment raw text into sentences: featurize, classify, then call back
  # with (null, sentences) or (err).
  segment: (text, callback) ->
    doc = new Document text
    doc.featurize this
    @classify doc, (err) ->
      return callback err if err?
      callback null, doc.segment()

  # NOTE(review): legacy Python from the original implementation this port
  # was based on; kept for reference, never executed.
  # def prep(self, doc):
  # self.lower_words, self.non_abbrs = doc.get_stats(verbose=False)
  # self.lower_words = dict(self.lower_words)
  # self.non_abbrs = dict(self.non_abbrs)
  # def train(self, doc):
  # abstract
  # def save(self):
  # """
  # save model objects in self.path
  # """
  # sbd_util.save_pickle(self.feats, self.path + 'feats')
  # sbd_util.save_pickle(self.lower_words, self.path + 'lower_words')
  # sbd_util.save_pickle(self.non_abbrs, self.path + 'non_abbrs')

exports.Model = Model