Skip to content

Commit d0de8a0

Browse files
Merge pull request #271 from RubixML/2.3
2.3
2 parents 3f0b211 + 3c76343 commit d0de8a0

30 files changed

+668
-10
lines changed

.gitignore

+3-3
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,6 @@ Thumbs.db
88
.DS_Store
99
debug.log
1010
/test.png
11-
/.idea
12-
/.vscode
13-
/.vs
11+
.idea
12+
.vscode
13+
.vs

CHANGELOG.md

+5
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,8 @@
1+
- 2.3.0
2+
- Add BM25 Transformer
3+
- Add `dropFeature()` method to the dataset object API
4+
- Add neural network architecture visualization via GraphViz
5+
16
- 2.2.2
27
- Fix Grid Search best model selection
38

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
<?php
2+
3+
namespace Rubix\ML\Benchmarks\Transformers;
4+
5+
use Tensor\Matrix;
6+
use Rubix\ML\Datasets\Unlabeled;
7+
use Rubix\ML\Transformers\BM25Transformer;
8+
9+
/**
10+
* @Groups({"Transformers"})
11+
* @BeforeMethods({"setUp"})
12+
*/
13+
class BM25TransformerBench
{
    /**
     * The number of rows in the generated sample matrix.
     */
    protected const NUM_SAMPLES = 10000;

    /**
     * The dataset that the transformer is applied to.
     *
     * @var \Rubix\ML\Datasets\Unlabeled
     */
    protected $dataset;

    /**
     * The transformer under benchmark.
     *
     * @var \Rubix\ML\Transformers\BM25Transformer
     */
    protected $transformer;

    /**
     * Not read or written anywhere in this class.
     *
     * @var array<array<mixed>>
     */
    protected $aSamples;

    /**
     * Not read or written anywhere in this class.
     *
     * @var array<array<mixed>>
     */
    protected $bSamples;

    /**
     * Generate the benchmark dataset and instantiate a new transformer.
     */
    public function setUp() : void
    {
        $rows = self::NUM_SAMPLES;
        $columns = 100;

        // Mask built by comparing random entries against 0.8 - presumably this
        // zeroes out most of the Gaussian values below (TODO confirm).
        $mask = Matrix::rand($rows, $columns)->greater(0.8);

        $values = Matrix::gaussian($rows, $columns)->multiply($mask);

        $this->dataset = Unlabeled::quick($values->asArray());

        $this->transformer = new BM25Transformer();
    }

    /**
     * Apply the transformer to the generated dataset.
     *
     * @Subject
     * @Iterations(3)
     * @OutputTimeUnit("milliseconds", precision=3)
     */
    public function apply() : void
    {
        $transformer = $this->transformer;

        $this->dataset->apply($transformer);
    }
}

docs/classifiers/multilayer-perceptron.md

+16
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,22 @@ Returns the underlying neural network instance or `null` if untrained:
7777
public network() : Network|null
7878
```
7979

80+
Export a Graphviz "dot" encoding of the neural network architecture:
81+
```php
82+
public exportGraphviz() : Encoding
83+
```
84+
85+
```php
86+
use Rubix\ML\Helpers\Graphviz;
87+
use Rubix\ML\Persisters\Filesystem;
88+
89+
$dot = $estimator->exportGraphviz();
90+
91+
Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
92+
```
93+
94+
![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)
95+
8096
## References
8197
[^1]: G. E. Hinton. (1989). Connectionist learning procedures.
8298
[^2]: L. Prechelt. (1997). Early Stopping - but when?

docs/datasets/api.md

+6
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,12 @@ Select the values of a feature column at a given offset :
101101
public feature(int $offset) : mixed[]
102102
```
103103

104+
## Dropping
105+
Drop the feature column at a given offset from the dataset. The dataset is modified in place and returned so that calls can be chained:
106+
```php
107+
public dropFeature(int $offset) : self
108+
```
109+
104110
## Head and Tail
105111
Return the first *n* rows of data in a new dataset object:
106112
```php

docs/images/neural-network-graph.png

80.6 KB
Loading

docs/preprocessing.md

+1
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ The library provides a number of transformers for Natural Language Processing (N
110110

111111
| Transformer | Supervised | [Stateful](transformers/api.md#stateful) | [Elastic](transformers/api.md#elastic) |
112112
|---|---|---|---|
113+
| [BM25 Transformer](transformers/bm25-transformer.md) | | ● | ● |
113114
| [Regex Filter](transformers/regex-filter.md) | | | |
114115
| [Text Normalizer](transformers/text-normalizer.md) | | | |
115116
| [Multibyte Text Normalizer](transformers/multibyte-text-normalizer.md) | | | |

docs/regressors/mlp-regressor.md

+16
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,22 @@ Returns the underlying neural network instance or `null` if untrained:
7575
public network() : Network|null
7676
```
7777

78+
Export a Graphviz "dot" encoding of the neural network architecture:
79+
```php
80+
public exportGraphviz() : Encoding
81+
```
82+
83+
```php
84+
use Rubix\ML\Helpers\Graphviz;
85+
use Rubix\ML\Persisters\Filesystem;
86+
87+
$dot = $estimator->exportGraphviz();
88+
89+
Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
90+
```
91+
92+
![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)
93+
7894
## References
7995
[^1]: G. E. Hinton. (1989). Connectionist learning procedures.
8096
[^2]: L. Prechelt. (1997). Early Stopping - but when?

docs/transformers/bm25-transformer.md

+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/BM25Transformer.php">[source]</a></span>
2+
3+
# BM25 Transformer
4+
BM25 is a sublinear term weighting scheme that takes term frequency (TF), document frequency (DF), and document length into account. It is similar to [TF-IDF](tf-idf-transformer.md) but with variable sublinearity and the addition of document length normalization.
5+
6+
> **Note:** BM25 Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).
7+
8+
**Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic)
9+
10+
**Data Type Compatibility:** Continuous only
11+
12+
## Parameters
13+
| # | Param | Default | Type | Description |
14+
|---|---|---|---|---|
15+
| 1 | dampening | 1.2 | float | The term frequency (TF) dampening factor i.e. the `K1` parameter in the formula. Lower values will cause the TF to saturate quicker. |
16+
| 2 | normalization | 0.75 | float | The importance of document length in normalizing the term frequency i.e. the `b` parameter in the formula. |
17+
18+
## Example
19+
```php
20+
use Rubix\ML\Transformers\BM25Transformer;
21+
22+
$transformer = new BM25Transformer(1.2, 0.75);
23+
```
24+
25+
## Additional Methods
26+
Return the document frequencies calculated during fitting:
27+
```php
28+
public dfs() : ?array
29+
```
30+
31+
Return the average number of tokens per document:
32+
```php
33+
public averageDocumentLength() : ?float
34+
```
35+
36+
### References
37+
>- S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
38+
>- K. Sparck Jones et al. (2000). A probabilistic model of information retrieval: development and comparative experiments.

mkdocs.yml

+1
Original file line numberDiff line numberDiff line change
@@ -139,6 +139,7 @@ nav:
139139
- KNN Imputer: transformers/knn-imputer.md
140140
- Missing Data Imputer: transformers/missing-data-imputer.md
141141
- Natural Language:
142+
- BM25 Transformer: transformers/bm25-transformer.md
142143
- Regex Filter: transformers/regex-filter.md
143144
- Text Normalizer: transformers/text-normalizer.md
144145
- Multibyte Text Normalizer: transformers/multibyte-text-normalizer.md

src/Classifiers/MultilayerPerceptron.php

+16
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
use Rubix\ML\Learner;
77
use Rubix\ML\Verbose;
88
use Rubix\ML\DataType;
9+
use Rubix\ML\Encoding;
910
use Rubix\ML\Estimator;
1011
use Rubix\ML\Persistable;
1112
use Rubix\ML\Probabilistic;
@@ -544,6 +545,21 @@ public function proba(Dataset $dataset) : array
544545
return $probabilities;
545546
}
546547

548+
/**
 * Return a Graphviz dot encoding of the underlying network's architecture.
 *
 * @throws \Rubix\ML\Exceptions\RuntimeException if the network property is empty
 * @return \Rubix\ML\Encoding
 */
public function exportGraphviz() : Encoding
{
    $network = $this->network;

    if (!$network) {
        throw new RuntimeException('Must train network first.');
    }

    // The graph itself is built by the network object.
    return $network->exportGraphviz();
}
562+
547563
/**
548564
* Return an associative array containing the data used to serialize the object.
549565
*

src/Datasets/Dataset.php

+15
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,21 @@ public function feature(int $offset) : array
196196
return array_column($this->samples, $offset);
197197
}
198198

199+
/**
 * Drop a feature column at a given offset from the dataset.
 *
 * The samples are modified in place and the same dataset instance is returned.
 *
 * @param int $offset
 * @return self
 */
public function dropFeature(int $offset) : self
{
    foreach ($this->samples as &$sample) {
        // Remove exactly one element at the offset from this row.
        array_splice($sample, $offset, 1);
    }

    // Break the reference so $sample no longer aliases the last row of the samples.
    unset($sample);

    return $this;
}
213+
199214
/**
200215
* Rotate the sample matrix so that the values of each feature become rows.
201216
*

src/NeuralNet/FeedForward.php

+30
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
namespace Rubix\ML\NeuralNet;
44

55
use Tensor\Matrix;
6+
use Rubix\ML\Encoding;
67
use Rubix\ML\Datasets\Dataset;
78
use Rubix\ML\Datasets\Labeled;
89
use Rubix\ML\NeuralNet\Layers\Input;
@@ -218,4 +219,33 @@ public function backpropagate(array $labels) : float
218219

219220
return $loss;
220221
}
222+
223+
/**
 * Export the network architecture as a graph in dot format.
 *
 * Every layer returned by layers() becomes one node labelled with the layer's
 * string representation, and each node after the first is connected to the node
 * of the layer before it.
 *
 * @return \Rubix\ML\Encoding
 */
public function exportGraphviz() : Encoding
{
    $dot = 'digraph Tree {' . PHP_EOL;
    $dot .= ' node [shape=box, fontname=helvetica];' . PHP_EOL;

    $layerNum = 0;

    foreach ($this->layers() as $layer) {
        ++$layerNum;

        // Escape double quotes so a label cannot terminate the quoted dot string early.
        $label = str_replace('"', '\\"', (string) $layer);

        $dot .= " N$layerNum [label=\"$label\",style=\"rounded\"]" . PHP_EOL;

        // Every layer except the first gets an edge from its predecessor.
        if ($layerNum > 1) {
            $parentId = $layerNum - 1;

            $dot .= " N{$parentId} -> N{$layerNum};" . PHP_EOL;
        }
    }

    $dot .= '}';

    return new Encoding($dot);
}
221251
}

src/NeuralNet/Layers/Binary.php

+12
Original file line numberDiff line numberDiff line change
@@ -199,4 +199,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
199199
return $this->sigmoid->differentiate($input, $output)
200200
->multiply($dLoss);
201201
}
202+
203+
/**
 * Describe the layer as a string that names its cost function.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return 'Binary (cost function: ' . $this->costFn . ')';
}
202214
}

src/NeuralNet/Layers/Continuous.php

+12
Original file line numberDiff line numberDiff line change
@@ -137,4 +137,16 @@ public function gradient(Matrix $input, Matrix $expected) : Matrix
137137
return $this->costFn->differentiate($input, $expected)
138138
->divide($input->n());
139139
}
140+
141+
/**
 * Describe the layer as a string that names its cost function.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return 'Continuous (cost function: ' . $this->costFn . ')';
}
140152
}

src/NeuralNet/Layers/Hidden.php

+1-2
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@
44

55
use Rubix\ML\Deferred;
66
use Rubix\ML\NeuralNet\Optimizers\Optimizer;
7-
use Stringable;
87

98
/**
109
* Hidden
@@ -13,7 +12,7 @@
1312
* @package Rubix/ML
1413
* @author Andrew DalPino
1514
*/
16-
interface Hidden extends Layer, Stringable
15+
interface Hidden extends Layer
1716
{
1817
/**
1918
* Calculate the gradient and update the parameters of the layer.

src/NeuralNet/Layers/Layer.php

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,9 @@
33
namespace Rubix\ML\NeuralNet\Layers;
44

55
use Tensor\Matrix;
6+
use Stringable;
67

7-
interface Layer
8+
interface Layer extends Stringable
89
{
910
/**
1011
* The width of the layer. i.e. the number of neurons or computation nodes.

src/NeuralNet/Layers/Multiclass.php

+12
Original file line numberDiff line numberDiff line change
@@ -205,4 +205,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
205205
return $this->softmax->differentiate($input, $output)
206206
->multiply($dLoss);
207207
}
208+
209+
/**
 * Describe the layer as a string that names its cost function.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return 'Multiclass (cost function: ' . $this->costFn . ')';
}
208220
}

src/NeuralNet/Layers/Placeholder1D.php

+12
Original file line numberDiff line numberDiff line change
@@ -88,4 +88,16 @@ public function infer(Matrix $input) : Matrix
8888
{
8989
return $this->forward($input);
9090
}
91+
92+
/**
 * Describe the layer as a string that includes its number of inputs.
 *
 * @internal
 *
 * @return string
 */
public function __toString() : string
{
    return 'Placeholder 1D (inputs: ' . $this->inputs . ')';
}
91103
}

0 commit comments

Comments
 (0)