RubixML
diff --git a/‎.gitignore
+3-3 b/‎.gitignore
+3-3
diff --git a/‎CHANGELOG.md
+5 b/‎CHANGELOG.md
+5
diff --git a/‎benchmarks/Transformers/BM25TransformerBench.php
+60 b/‎benchmarks/Transformers/BM25TransformerBench.php
+60
diff --git a/‎docs/classifiers/multilayer-perceptron.md
+16 b/‎docs/classifiers/multilayer-perceptron.md
+16
diff --git a/‎docs/datasets/api.md
+6 b/‎docs/datasets/api.md
+6
diff --git a/‎docs/images/neural-network-graph.png
80.6 KB b/‎docs/images/neural-network-graph.png
80.6 KB
diff --git a/‎docs/preprocessing.md
+1 b/‎docs/preprocessing.md
+1
diff --git a/‎docs/regressors/mlp-regressor.md
+16 b/‎docs/regressors/mlp-regressor.md
+16
diff --git a/‎docs/transformers/bm25-transformer.md
+38 b/‎docs/transformers/bm25-transformer.md
+38
diff --git a/‎mkdocs.yml
+1 b/‎mkdocs.yml
+1
diff --git a/‎src/Classifiers/MultilayerPerceptron.php
+16 b/‎src/Classifiers/MultilayerPerceptron.php
+16
diff --git a/‎src/Datasets/Dataset.php
+15 b/‎src/Datasets/Dataset.php
+15
diff --git a/‎src/NeuralNet/FeedForward.php
+30 b/‎src/NeuralNet/FeedForward.php
+30
diff --git a/‎src/NeuralNet/Layers/Binary.php
+12 b/‎src/NeuralNet/Layers/Binary.php
+12
diff --git a/‎src/NeuralNet/Layers/Continuous.php
+12 b/‎src/NeuralNet/Layers/Continuous.php
+12
diff --git a/‎src/NeuralNet/Layers/Hidden.php
+1-2 b/‎src/NeuralNet/Layers/Hidden.php
+1-2
diff --git a/‎src/NeuralNet/Layers/Layer.php
+2-1 b/‎src/NeuralNet/Layers/Layer.php
+2-1
diff --git a/‎src/NeuralNet/Layers/Multiclass.php
+12 b/‎src/NeuralNet/Layers/Multiclass.php
+12
diff --git a/‎src/NeuralNet/Layers/Placeholder1D.php
+12 b/‎src/NeuralNet/Layers/Placeholder1D.php
+12
@@ -8,6 +8,6 @@ Thumbs.db
 .DS_Store
 debug.log
 /test.png
-/.idea
-/.vscode
-/.vs
+.idea
+.vscode
+.vs
@@ -1,3 +1,8 @@
+- 2.3.0
+    - Add BM25 Transformer
+    - Add `dropFeature()` method to the dataset object API
+    - Add neural network architecture visualization via GraphViz
+    
 - 2.2.2
     - Fix Grid Search best model selection
 
 
@@ -0,0 +1,60 @@
+<?php
+
+namespace Rubix\ML\Benchmarks\Transformers;
+
+use Tensor\Matrix;
+use Rubix\ML\Datasets\Unlabeled;
+use Rubix\ML\Transformers\BM25Transformer;
+
+/**
+ * @Groups({"Transformers"})
+ * @BeforeMethods({"setUp"})
+ */
+class BM25TransformerBench
+{
+    /**
+     * The number of rows in the synthetic benchmark dataset.
+     */
+    protected const NUM_SAMPLES = 10000;
+
+    /**
+     * The synthetic dataset that the transformer is applied to.
+     *
+     * @var \Rubix\ML\Datasets\Unlabeled
+     */
+    protected $dataset;
+
+    /**
+     * The transformer under benchmark.
+     *
+     * @var \Rubix\ML\Transformers\BM25Transformer
+     */
+    protected $transformer;
+
+    /**
+     * Build a fresh random dataset and transformer. Registered as a before-method
+     * through the BeforeMethods annotation on the class.
+     */
+    public function setUp() : void
+    {
+        // Element-wise mask obtained by comparing random values against 0.8.
+        // NOTE(review): presumably this zeroes out most entries so the samples
+        // resemble sparse token count vectors - confirm against the Tensor library.
+        $mask = Matrix::rand(self::NUM_SAMPLES, 100)
+            ->greater(0.8);
+
+        $samples = Matrix::gaussian(self::NUM_SAMPLES, 100)
+            ->multiply($mask)
+            ->asArray();
+
+        $this->dataset = Unlabeled::quick($samples);
+
+        $this->transformer = new BM25Transformer();
+    }
+
+    /**
+     * Benchmark applying the transformer to the dataset.
+     *
+     * @Subject
+     * @Iterations(3)
+     * @OutputTimeUnit("milliseconds", precision=3)
+     */
+    public function apply() : void
+    {
+        $this->dataset->apply($this->transformer);
+    }
+}
@@ -77,6 +77,22 @@ Returns the underlying neural network instance or `null` if untrained:
 public network() : Network|null
 ```
 
+Export a Graphviz "dot" encoding of the neural network architecture:
+```php
+public exportGraphviz() : Encoding
+```
+
+```php
+use Rubix\ML\Helpers\Graphviz;
+use Rubix\ML\Persisters\Filesystem;
+
+$dot = $estimator->exportGraphviz();
+
+Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
+```
+
+![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)
+
 ## References
 [^1]: G. E. Hinton. (1989). Connectionist learning procedures.
 [^2]: L. Prechelt. (1997). Early Stopping - but when?
@@ -101,6 +101,12 @@ Select the values of a feature column at a given offset :
 public feature(int $offset) : mixed[]
 ```
 
+## Dropping
+Drop a feature at a given column offset from the dataset:
+```php
+public dropFeature(int $offset) : self
+```
+
 ## Head and Tail
 Return the first *n* rows of data in a new dataset object:
 ```php
 
@@ -110,6 +110,7 @@ The library provides a number of transformers for Natural Language Processing (N
 
 | Transformer | Supervised | [Stateful](transformers/api.md#stateful) | [Elastic](transformers/api.md#elastic) |
 |---|---|---|---|
+| [BM25 Transformer](transformers/bm25-transformer.md) | | ● | ● |
 | [Regex Filter](transformers/regex-filter.md) | | | |
 | [Text Normalizer](transformers/text-normalizer.md) | | | |
 | [Multibyte Text Normalizer](transformers/multibyte-text-normalizer.md) | | | |
 
@@ -75,6 +75,22 @@ Returns the underlying neural network instance or `null` if untrained:
 public network() : Network|null
 ```
 
+Export a Graphviz "dot" encoding of the neural network architecture:
+```php
+public exportGraphviz() : Encoding
+```
+
+```php
+use Rubix\ML\Helpers\Graphviz;
+use Rubix\ML\Persisters\Filesystem;
+
+$dot = $estimator->exportGraphviz();
+
+Graphviz::dotToImage($dot)->saveTo(new Filesystem('network.png'));
+```
+
+![Neural Network Graph](https://github.com/RubixML/ML/blob/master/docs/images/neural-network-graph.png?raw=true)
+
 ## References
 [^1]: G. E. Hinton. (1989). Connectionist learning procedures.
 [^2]: L. Prechelt. (1997). Early Stopping - but when?
@@ -0,0 +1,38 @@
+<span style="float:right;"><a href="https://github.com/RubixML/ML/blob/master/src/Transformers/BM25Transformer.php">[source]</a></span>
+
+# BM25 Transformer
+BM25 is a sublinear term weighting scheme that takes term frequency (TF), document frequency (DF), and document length into account. It is similar to [TF-IDF](tf-idf-transformer.md) but with variable sublinearity and the addition of document length normalization.
+
+> **Note:** BM25 Transformer assumes that its inputs are token frequency vectors such as those created by [Word Count Vectorizer](word-count-vectorizer.md).
+
+**Interfaces:** [Transformer](api.md#transformer), [Stateful](api.md#stateful), [Elastic](api.md#elastic)
+
+**Data Type Compatibility:** Continuous only
+
+## Parameters
+| # | Param | Default | Type | Description |
+|---|---|---|---|---|
+| 1 | dampening | 1.2 | float | The term frequency (TF) dampening factor i.e. the `K1` parameter in the formula. Lower values will cause the TF to saturate more quickly. |
+| 2 | normalization | 0.75 | float | The importance of document length in normalizing the term frequency i.e. the `b` parameter in the formula. |
+
+## Example
+```php
+use Rubix\ML\Transformers\BM25Transformer;
+
+$transformer = new BM25Transformer(1.2, 0.75);
+```
+
+## Additional Methods
+Return the document frequencies calculated during fitting:
+```php
+public dfs() : ?array
+```
+
+Return the average number of tokens per document:
+```php
+public averageDocumentLength() : ?float
+```
+
+### References
+>- S. Robertson et al. (2009). The Probabilistic Relevance Framework: BM25 and Beyond.
+>- K. Sparck Jones et al. (2000). A probabilistic model of information retrieval: development and comparative experiments.
@@ -139,6 +139,7 @@ nav:
         - KNN Imputer: transformers/knn-imputer.md
         - Missing Data Imputer: transformers/missing-data-imputer.md
       - Natural Language:
+        - BM25 Transformer: transformers/bm25-transformer.md
         - Regex Filter: transformers/regex-filter.md
         - Text Normalizer: transformers/text-normalizer.md
         - Multibyte Text Normalizer: transformers/multibyte-text-normalizer.md
 
@@ -6,6 +6,7 @@
 use Rubix\ML\Learner;
 use Rubix\ML\Verbose;
 use Rubix\ML\DataType;
+use Rubix\ML\Encoding;
 use Rubix\ML\Estimator;
 use Rubix\ML\Persistable;
 use Rubix\ML\Probabilistic;
@@ -544,6 +545,21 @@ public function proba(Dataset $dataset) : array
         return $probabilities;
     }
 
+    /**
+     * Return a Graphviz dot encoding describing the architecture of the underlying network.
+     *
+     * @throws \Rubix\ML\Exceptions\RuntimeException if the network has not been set yet
+     * @return \Rubix\ML\Encoding
+     */
+    public function exportGraphviz() : Encoding
+    {
+        $network = $this->network;
+
+        if ($network) {
+            // Delegate to the network, which knows its own layers.
+            return $network->exportGraphviz();
+        }
+
+        throw new RuntimeException('Must train network first.');
+    }
+
     /**
      * Return an associative array containing the data used to serialize the object.
      *
 
@@ -196,6 +196,21 @@ public function feature(int $offset) : array
         return array_column($this->samples, $offset);
     }
 
+    /**
+     * Drop a feature column at a given offset from the dataset.
+     *
+     * The samples are modified in place and the same dataset instance is
+     * returned so that calls can be chained.
+     *
+     * @param int $offset
+     * @return self
+     */
+    public function dropFeature(int $offset) : self
+    {
+        foreach ($this->samples as &$sample) {
+            // Removes one element at the offset and reindexes the remaining values.
+            array_splice($sample, $offset, 1);
+        }
+
+        // Break the reference left over from the by-reference loop so that a
+        // later write to $sample cannot silently alter the last row.
+        unset($sample);
+
+        return $this;
+    }
+
     /**
      * Rotate the sample matrix so that the values of each feature become rows.
      *
 
@@ -3,6 +3,7 @@
 namespace Rubix\ML\NeuralNet;
 
 use Tensor\Matrix;
+use Rubix\ML\Encoding;
 use Rubix\ML\Datasets\Dataset;
 use Rubix\ML\Datasets\Labeled;
 use Rubix\ML\NeuralNet\Layers\Input;
@@ -218,4 +219,33 @@ public function backpropagate(array $labels) : float
 
         return $loss;
     }
+
+    /**
+     * Export the network architecture as a graph in dot format.
+     *
+     * Every layer yielded by layers() becomes one box-shaped node labelled with
+     * the string representation of the layer, and each node after the first is
+     * connected to the node before it by a directed edge.
+     *
+     * @return \Rubix\ML\Encoding
+     */
+    public function exportGraphviz() : Encoding
+    {
+        $dot = 'digraph Tree {' . PHP_EOL;
+        $dot .= '  node [shape=box, fontname=helvetica];' . PHP_EOL;
+
+        $layerNum = 0;
+
+        foreach ($this->layers() as $layer) {
+            ++$layerNum;
+
+            // Escape backslashes and double quotes so that a layer description
+            // containing them cannot terminate the quoted label early.
+            $label = addcslashes((string) $layer, '"\\');
+
+            $dot .= "  N$layerNum [label=\"$label\",style=\"rounded\"]" . PHP_EOL;
+
+            // Every layer except the first is linked to its predecessor.
+            if ($layerNum > 1) {
+                $parentId = $layerNum - 1;
+
+                $dot .= "  N{$parentId} -> N{$layerNum};" . PHP_EOL;
+            }
+        }
+
+        $dot .= '}';
+
+        return new Encoding($dot);
+    }
 }
@@ -199,4 +199,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
         return $this->sigmoid->differentiate($input, $output)
             ->multiply($dLoss);
     }
+
+    /**
+     * Describe the layer and its cost function as a human-readable string.
+     *
+     * @internal
+     *
+     * @return string
+     */
+    public function __toString() : string
+    {
+        return 'Binary (cost function: ' . $this->costFn . ')';
+    }
 }
@@ -137,4 +137,16 @@ public function gradient(Matrix $input, Matrix $expected) : Matrix
         return $this->costFn->differentiate($input, $expected)
             ->divide($input->n());
     }
+
+    /**
+     * Describe the layer and its cost function as a human-readable string.
+     *
+     * @internal
+     *
+     * @return string
+     */
+    public function __toString() : string
+    {
+        return 'Continuous (cost function: ' . $this->costFn . ')';
+    }
 }
@@ -4,7 +4,6 @@
 
 use Rubix\ML\Deferred;
 use Rubix\ML\NeuralNet\Optimizers\Optimizer;
-use Stringable;
 
 /**
  * Hidden
@@ -13,7 +12,7 @@
  * @package     Rubix/ML
  * @author      Andrew DalPino
  */
-interface Hidden extends Layer, Stringable
+interface Hidden extends Layer
 {
     /**
      * Calculate the gradient and update the parameters of the layer.
 
@@ -3,8 +3,9 @@
 namespace Rubix\ML\NeuralNet\Layers;
 
 use Tensor\Matrix;
+use Stringable;
 
-interface Layer
+interface Layer extends Stringable
 {
     /**
      * The width of the layer. i.e. the number of neurons or computation nodes.
 
@@ -205,4 +205,16 @@ public function gradient(Matrix $input, Matrix $output, Matrix $expected) : Matr
         return $this->softmax->differentiate($input, $output)
             ->multiply($dLoss);
     }
+
+    /**
+     * Describe the layer and its cost function as a human-readable string.
+     *
+     * @internal
+     *
+     * @return string
+     */
+    public function __toString() : string
+    {
+        return 'Multiclass (cost function: ' . $this->costFn . ')';
+    }
 }
@@ -88,4 +88,16 @@ public function infer(Matrix $input) : Matrix
     {
         return $this->forward($input);
     }
+
+    /**
+     * Describe the layer and its number of inputs as a human-readable string.
+     *
+     * @internal
+     *
+     * @return string
+     */
+    public function __toString() : string
+    {
+        return 'Placeholder 1D (inputs: ' . $this->inputs . ')';
+    }
 }
Original file line number	Diff line number	Diff line change
`@@ -3,8 +3,9 @@`
`3`	`3`	`namespace Rubix\ML\NeuralNet\Layers;`
`4`	`4`
`5`	`5`	`use Tensor\Matrix;`
	`6`	`+use Stringable;`
`6`	`7`
`7`		`-interface Layer`
	`8`	`+interface Layer extends Stringable`
`8`	`9`	`{`
`9`	`10`	`/**`
`10`	`11`	`* The width of the layer. i.e. the number of neurons or computation nodes.`
Original file line number	Diff line number	Diff line change
`@@ -88,4 +88,16 @@ public function infer(Matrix $input) : Matrix`
`88`	`88`	`{`
`89`	`89`	`return $this->forward($input);`
`90`	`90`	`}`
	`91`	`+`
	`92`	`+ /**`
	`93`	`+ * Return the string representation of the object.`
	`94`	`+ *`
	`95`	`+ * @internal`
	`96`	`+ *`
	`97`	`+ * @return string`
	`98`	`+ */`
	`99`	`+ public function __toString() : string`
	`100`	`+ {`
	`101`	`+ return "Placeholder 1D (inputs: {$this->inputs})";`
	`102`	`+ }`
`91`	`103`	`}`