/
DeepWalk.java
254 lines (213 loc) · 9.37 KB
/
DeepWalk.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
/*
* ******************************************************************************
* *
* *
* * This program and the accompanying materials are made available under the
* * terms of the Apache License, Version 2.0 which is available at
* * https://www.apache.org/licenses/LICENSE-2.0.
* *
* * See the NOTICE file distributed with this work for additional
* * information regarding copyright ownership.
* * Unless required by applicable law or agreed to in writing, software
* * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* * License for the specific language governing permissions and limitations
* * under the License.
* *
* * SPDX-License-Identifier: Apache-2.0
* *****************************************************************************
*/
package org.deeplearning4j.graph.models.deepwalk;
import lombok.AllArgsConstructor;
import org.deeplearning4j.graph.api.IGraph;
import org.deeplearning4j.graph.api.IVertexSequence;
import org.deeplearning4j.graph.api.NoEdgeHandling;
import org.deeplearning4j.graph.iterator.GraphWalkIterator;
import org.deeplearning4j.graph.iterator.parallel.GraphWalkIteratorProvider;
import org.deeplearning4j.graph.iterator.parallel.RandomWalkGraphIteratorProvider;
import org.deeplearning4j.graph.models.embeddings.GraphVectorLookupTable;
import org.deeplearning4j.graph.models.embeddings.GraphVectorsImpl;
import org.deeplearning4j.graph.models.embeddings.InMemoryGraphLookupTable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.threadly.concurrent.PriorityScheduler;
import org.threadly.concurrent.future.FutureUtils;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.concurrent.*;
import java.util.concurrent.atomic.AtomicLong;
/**
 * DeepWalk model: learns a vector for every vertex of a graph by applying skip-gram style
 * updates to vertex sequences produced by graph walk iterators.
 *
 * <p>Typical use: create an instance with {@link Builder}, then call {@link #fit(IGraph, int)}.
 * Alternatively call {@link #initialize(IGraph)} or {@link #initialize(int[])} explicitly and
 * then one of the other {@code fit} overloads.
 *
 * @param <V> vertex value type of the graph
 * @param <E> edge value type of the graph
 */
public class DeepWalk<V, E> extends GraphVectorsImpl<V, E> {
    /** Number of processed walks between two progress log messages. */
    public static final int STATUS_UPDATE_FREQUENCY = 1000;

    // Static and final so the logger is shared and never becomes part of per-instance state.
    private static final Logger log = LoggerFactory.getLogger(DeepWalk.class);

    /** Size of the vector learned for each vertex. */
    private int vectorSize;
    /** Number of positions on each side of the centre vertex used by the skip-gram step. */
    private int windowSize;
    private double learningRate;
    /** Set once the lookup table has been created by {@link #initialize(int[])}. */
    private boolean initCalled = false;
    /** Seed handed to the random walk iterator provider. */
    private long seed;
    /** Number of walk iterators requested, and size of the scheduler, for parallel fitting. */
    private int nThreads = Runtime.getRuntime().availableProcessors();
    // Counts walks processed across all fitting threads; only used for progress logging.
    // NOTE(review): the field is transient, so if this class is ever restored through Java
    // serialization the counter would be null and fit(...) would fail - confirm whether
    // the superclass is Serializable and re-create the counter on read if so.
    private transient AtomicLong walkCounter = new AtomicLong(0);

    /** Creates an unconfigured model; {@link Builder} is the usual way to obtain an instance. */
    public DeepWalk() {
    }

    /** @return size of the vector learned for each vertex */
    public int getVectorSize() {
        return vectorSize;
    }

    /** @return window size used by the skip-gram step */
    public int getWindowSize() {
        return windowSize;
    }

    /** @return the current learning rate */
    public double getLearningRate() {
        return learningRate;
    }

    /**
     * Sets the learning rate, and forwards it to the lookup table if one has already been created.
     *
     * @param learningRate new learning rate
     */
    public void setLearningRate(double learningRate) {
        this.learningRate = learningRate;
        if (lookupTable != null) {
            lookupTable.setLearningRate(learningRate);
        }
    }

    /**
     * Initialize the DeepWalk model with a given graph. The degree of every vertex is read from
     * the graph and passed on to {@link #initialize(int[])}.
     *
     * @param graph graph whose vertex degrees are used for initialization
     */
    public void initialize(IGraph<V, E> graph) {
        int nVertices = graph.numVertices();
        int[] degrees = new int[nVertices];
        for (int i = 0; i < nVertices; i++) {
            degrees[i] = graph.getVertexDegree(i);
        }
        initialize(degrees);
    }

    /**
     * Initialize the DeepWalk model with a list of vertex degrees for a graph.<br>
     * Specifically, graphVertexDegrees[i] represents the vertex degree of the ith vertex<br>
     * vertex degrees are used to construct a binary (Huffman) tree, which is in turn used in
     * the hierarchical softmax implementation
     *
     * @param graphVertexDegrees degrees of each vertex
     */
    public void initialize(int[] graphVertexDegrees) {
        log.info("Initializing: Creating Huffman tree and lookup table...");
        GraphHuffman gh = new GraphHuffman(graphVertexDegrees.length);
        gh.buildTree(graphVertexDegrees);
        lookupTable = new InMemoryGraphLookupTable(graphVertexDegrees.length, vectorSize, gh, learningRate);
        initCalled = true;
        log.info("Initialization complete");
    }

    /**
     * Fit the model, in parallel.
     * This creates a set of GraphWalkIterators, which are then distributed one to each thread.
     * If the model has not been initialized yet, it is initialized from {@code graph} first.
     *
     * @param graph      Graph to fit
     * @param walkLength Length of random walks to generate
     */
    public void fit(IGraph<V, E> graph, int walkLength) {
        if (!initCalled) {
            initialize(graph);
        }
        // The provider creates the walk iterators that are later handed out to the worker threads
        GraphWalkIteratorProvider<V> iteratorProvider = new RandomWalkGraphIteratorProvider<>(graph, walkLength, seed,
                NoEdgeHandling.SELF_LOOP_ON_DISCONNECTED);
        fit(iteratorProvider);
    }

    /**
     * Fit the model, in parallel, using a given GraphWalkIteratorProvider.<br>
     * This object is used to generate multiple GraphWalkIterators, which can then be distributed to each thread
     * to do in parallel<br>
     * Note that {@link #fit(IGraph, int)} will be more convenient in many cases<br>
     * Note that {@link #initialize(IGraph)} or {@link #initialize(int[])} <em>must</em> be called first.
     *
     * @param iteratorProvider GraphWalkIteratorProvider
     * @throws UnsupportedOperationException if the model has not been initialized
     * @throws RuntimeException if a worker fails or the calling thread is interrupted while waiting;
     *                          the original exception is preserved as the cause
     * @see #fit(IGraph, int)
     */
    public void fit(GraphWalkIteratorProvider<V> iteratorProvider) {
        if (!initCalled) {
            throw new UnsupportedOperationException("DeepWalk not initialized (call initialize before fit)");
        }
        List<GraphWalkIterator<V>> iteratorList = iteratorProvider.getGraphWalkIterators(nThreads);
        PriorityScheduler scheduler = new PriorityScheduler(nThreads);
        List<Future<Void>> futures = new ArrayList<>(iteratorList.size());
        try {
            for (GraphWalkIterator<V> iter : iteratorList) {
                futures.add(scheduler.submit(new LearningCallable(iter)));
            }
        } finally {
            // Always release the scheduler, even if a submit fails; tasks that were already
            // submitted still run to completion after shutdown() (see the wait below).
            scheduler.shutdown();
        }
        try {
            FutureUtils.blockTillAllCompleteOrFirstError(futures);
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers further up the stack can observe it
            Thread.currentThread().interrupt();
            throw new RuntimeException(e);
        } catch (ExecutionException e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Fit the DeepWalk model <b>using a single thread</b> using a given GraphWalkIterator. If parallel fitting is required,
     * {@link #fit(IGraph, int)} or {@link #fit(GraphWalkIteratorProvider)} should be used.<br>
     * Note that {@link #initialize(IGraph)} or {@link #initialize(int[])} <em>must</em> be called first.
     *
     * @param iterator iterator for graph walks
     * @throws UnsupportedOperationException if the model has not been initialized
     */
    public void fit(GraphWalkIterator<V> iterator) {
        if (!initCalled) {
            throw new UnsupportedOperationException("DeepWalk not initialized (call initialize before fit)");
        }
        // A walk of length n is expected to visit n + 1 vertices; keep the buffer at least 1 long
        int expectedVertices = Math.max(iterator.walkLength() + 1, 1);
        while (iterator.hasNext()) {
            int[] walk = readWalk(iterator.next(), expectedVertices);
            skipGram(walk);
            long processed = walkCounter.incrementAndGet();
            if (processed % STATUS_UPDATE_FREQUENCY == 0) {
                log.info("Processed {} random walks on graph", processed);
            }
        }
    }

    /**
     * Drains a vertex sequence into an array of vertex ids whose length is exactly the number of
     * vertices read. Trimming avoids treating unused buffer slots as vertex 0, and growing avoids
     * an index error when a sequence is longer than expected.
     *
     * @param sequence         sequence to read; it is consumed by this call
     * @param expectedVertices initial buffer size, must be at least 1
     * @return vertex ids in walk order
     */
    private int[] readWalk(IVertexSequence<V> sequence, int expectedVertices) {
        int[] walk = new int[expectedVertices];
        int count = 0;
        while (sequence.hasNext()) {
            if (count == walk.length) {
                walk = Arrays.copyOf(walk, walk.length * 2);
            }
            walk[count++] = sequence.next().vertexID();
        }
        return count == walk.length ? walk : Arrays.copyOf(walk, count);
    }

    /**
     * Runs one lookup-table update for every (centre, context) pair of the walk.
     * Only positions that have a full window on both sides are used as centre, so a walk shorter
     * than {@code 2 * windowSize + 1} produces no updates.
     *
     * @param walk vertex ids in walk order
     */
    private void skipGram(int[] walk) {
        for (int mid = windowSize; mid < walk.length - windowSize; mid++) {
            for (int pos = mid - windowSize; pos <= mid + windowSize; pos++) {
                if (pos == mid) {
                    continue;
                }
                // pair of vertices: walk[mid] -> walk[pos]
                lookupTable.iterate(walk[mid], walk[pos]);
            }
        }
    }

    /** @return the lookup table holding the vertex vectors, or null before initialization */
    public GraphVectorLookupTable lookupTable() {
        return lookupTable;
    }

    /**
     * Builder for {@link DeepWalk}. Defaults: vector size 100, learning rate 0.01, window size 2,
     * seed taken from the current system time.
     *
     * @param <V> vertex value type of the graph
     * @param <E> edge value type of the graph
     */
    public static class Builder<V, E> {
        private int vectorSize = 100;
        private long seed = System.currentTimeMillis();
        private double learningRate = 0.01;
        private int windowSize = 2;

        /** Sets the size of the vectors to be learned for each vertex in the graph */
        public Builder<V, E> vectorSize(int vectorSize) {
            this.vectorSize = vectorSize;
            return this;
        }

        /** Set the learning rate */
        public Builder<V, E> learningRate(double learningRate) {
            this.learningRate = learningRate;
            return this;
        }

        /** Sets the window size used in skipgram model */
        public Builder<V, E> windowSize(int windowSize) {
            this.windowSize = windowSize;
            return this;
        }

        /**
         * Seed for random number generation (used for repeatability).
         * Note however that parallel/async gradient descent might result in behaviour that
         * is not repeatable, in spite of setting seed
         */
        public Builder<V, E> seed(long seed) {
            this.seed = seed;
            return this;
        }

        /**
         * Creates a new, not yet initialized model carrying the configured values.
         *
         * @return the configured DeepWalk instance
         */
        public DeepWalk<V, E> build() {
            DeepWalk<V, E> dw = new DeepWalk<>();
            dw.vectorSize = vectorSize;
            dw.windowSize = windowSize;
            dw.learningRate = learningRate;
            dw.seed = seed;
            return dw;
        }
    }

    /** Task that fits the enclosing model on a single walk iterator; one is submitted per iterator. */
    @AllArgsConstructor
    private class LearningCallable implements Callable<Void> {
        private final GraphWalkIterator<V> iterator;

        @Override
        public Void call() throws Exception {
            fit(iterator);
            return null;
        }
    }
}