diff --git a/src/CSharp/CSharpExamples/ForwardForward.cs b/src/CSharp/CSharpExamples/ForwardForward.cs
new file mode 100644
index 0000000..7cf4334
--- /dev/null
+++ b/src/CSharp/CSharpExamples/ForwardForward.cs
@@ -0,0 +1,111 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.IO;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+using TorchSharp;
+using static TorchSharp.torchvision;
+
+using TorchSharp.Examples;
+using TorchSharp.Examples.Utils;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace CSharpExamples
+{
+ /// <summary>
+ /// Forward-Forward MNIST classification
+ ///
+ /// Based on: https://github.com/pytorch/examples/tree/main/mnist_forward_forward
+ ///
+ /// Implements the Forward-Forward algorithm (Geoffrey Hinton, 2022). Instead of
+ /// backpropagation, each layer is trained independently using a local contrastive loss.
+ /// Positive examples have the correct label overlaid, negative examples have wrong labels.
+ /// </summary>
+ public class ForwardForward
+ {
+ internal static void Run(int epochs, int timeout, string logdir)
+ {
+ var device =
+ torch.cuda.is_available() ? torch.CUDA :
+ torch.mps_is_available() ? torch.MPS :
+ torch.CPU;
+
+ Console.WriteLine();
+ Console.WriteLine($"\tRunning Forward-Forward MNIST on {device.type} for {epochs} epochs.");
+ Console.WriteLine();
+
+ torch.random.manual_seed(1);
+
+ var dataset = "mnist";
+ var datasetPath = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", dataset);
+
+ var sourceDir = datasetPath;
+ var targetDir = Path.Combine(datasetPath, "test_data");
+
+ if (!Directory.Exists(targetDir)) {
+ Directory.CreateDirectory(targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "train-images-idx3-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "train-labels-idx1-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-images-idx3-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-labels-idx1-ubyte.gz"), targetDir);
+ }
+
+ Console.WriteLine($"\tLoading data...");
+
+ // Load full training set as a single batch for the Forward-Forward algorithm
+ int trainSize = 50000;
+ int testSize = 10000;
+
+ using (MNISTReader trainReader = new MNISTReader(targetDir, "train", trainSize, device: device),
+ testReader = new MNISTReader(targetDir, "t10k", testSize, device: device))
+ {
+ Stopwatch totalTime = new Stopwatch();
+ totalTime.Start();
+
+ // Get one big batch of training data
+ Tensor x = null, y = null, xTe = null, yTe = null;
+
+ foreach (var (data, target) in trainReader) {
+ // Flatten the images: (N, 1, 28, 28) -> (N, 784)
+ x = data.view(data.shape[0], -1);
+ y = target;
+ break; // Just the first (and only) batch
+ }
+
+ foreach (var (data, target) in testReader) {
+ xTe = data.view(data.shape[0], -1);
+ yTe = target;
+ break;
+ }
+
+ Console.WriteLine($"\tCreating Forward-Forward network [784, 500, 500]...");
+
+ var net = new ForwardForwardNet(new int[] { 784, 500, 500 }, device);
+
+ // Create positive and negative examples
+ var xPos = ForwardForwardNet.OverlayLabelOnInput(x, y);
+ var yNeg = ForwardForwardNet.GetNegativeLabels(y);
+ var xNeg = ForwardForwardNet.OverlayLabelOnInput(x, yNeg);
+
+ Console.WriteLine($"\tTraining...");
+ net.Train(xPos, xNeg, epochs, lr: 0.03, logInterval: 10);
+
+ // Evaluate
+ var trainPred = net.Predict(x);
+ var trainError = 1.0f - trainPred.eq(y).to_type(ScalarType.Float32).mean().item<float>();
+ Console.WriteLine($"\tTrain error: {trainError:F4}");
+
+ var testPred = net.Predict(xTe);
+ var testError = 1.0f - testPred.eq(yTe).to_type(ScalarType.Float32).mean().item<float>();
+ Console.WriteLine($"\tTest error: {testError:F4}");
+
+ totalTime.Stop();
+ Console.WriteLine($"Elapsed time: {totalTime.Elapsed.TotalSeconds:F1} s.");
+ }
+ }
+ }
+}
diff --git a/src/CSharp/CSharpExamples/GAT.cs b/src/CSharp/CSharpExamples/GAT.cs
new file mode 100644
index 0000000..32934a0
--- /dev/null
+++ b/src/CSharp/CSharpExamples/GAT.cs
@@ -0,0 +1,123 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.Diagnostics;
+
+using TorchSharp;
+using TorchSharp.Examples;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace CSharpExamples
+{
+ /// <summary>
+ /// Graph Attention Network (GAT) for node classification
+ ///
+ /// Based on: https://github.com/pytorch/examples/tree/main/gat
+ ///
+ /// Implements a 2-layer GAT with multi-head attention for semi-supervised
+ /// node classification. Uses synthetic graph data for demonstration.
+ /// </summary>
+ public class GAT
+ {
+ internal static void Run(int epochs, int timeout, string logdir)
+ {
+ var device =
+ torch.cuda.is_available() ? torch.CUDA :
+ torch.mps_is_available() ? torch.MPS :
+ torch.CPU;
+
+ Console.WriteLine();
+ Console.WriteLine($"\tRunning GAT on {device.type} for {epochs} epochs, terminating after {TimeSpan.FromSeconds(timeout)}.");
+ Console.WriteLine();
+
+ torch.random.manual_seed(13);
+
+ // Synthetic graph data (simulating Cora-like structure)
+ int numNodes = 2708;
+ int numFeatures = 1433;
+ int numClasses = 7;
+ int hiddenDim = 64;
+ int numHeads = 8;
+
+ Console.WriteLine($"\tGenerating synthetic graph data...");
+ Console.WriteLine($"\t Nodes: {numNodes}, Features: {numFeatures}, Classes: {numClasses}");
+ Console.WriteLine($"\t Hidden: {hiddenDim}, Heads: {numHeads}");
+
+ var features = torch.randn(numNodes, numFeatures, device: device);
+ var labels = torch.randint(numClasses, numNodes, device: device);
+
+ // Create adjacency matrix with self-loops
+ var adjMat = torch.eye(numNodes, device: device);
+ // Add some random edges to simulate graph structure
+ var rng = new Random(13);
+ int numEdges = 10556;
+ for (int e = 0; e < numEdges; e++) {
+ int i = rng.Next(numNodes);
+ int j = rng.Next(numNodes);
+ adjMat[i, j] = 1.0f;
+ adjMat[j, i] = 1.0f;
+ }
+
+ // Split
+ var idx = torch.randperm(numNodes, device: device);
+ var idxTrain = idx.slice(0, 1600, numNodes, 1);
+ var idxVal = idx.slice(0, 1200, 1600, 1);
+ var idxTest = idx.slice(0, 0, 1200, 1);
+
+ Console.WriteLine($"\tCreating GAT model...");
+
+ var model = new GATModel("gat", numFeatures, hiddenDim, numHeads, numClasses,
+ concat: false, dropout: 0.6, leakyReluSlope: 0.2, device: device);
+
+ var optimizer = optim.Adam(model.parameters(), lr: 0.005, weight_decay: 5e-4);
+ var criterion = NLLLoss();
+
+ Console.WriteLine($"\tTraining...");
+
+ Stopwatch totalTime = new Stopwatch();
+ totalTime.Start();
+
+ for (int epoch = 1; epoch <= epochs; epoch++) {
+ using (var d = torch.NewDisposeScope()) {
+ model.train();
+ optimizer.zero_grad();
+
+ var output = model.forward(features, adjMat);
+ var loss = criterion.forward(output.index(idxTrain), labels.index(idxTrain));
+ loss.backward();
+ optimizer.step();
+
+ if (epoch % 20 == 0 || epoch == 1) {
+ model.eval();
+ using (torch.no_grad()) {
+ var evalOutput = model.forward(features, adjMat);
+
+ var trainAcc = evalOutput.index(idxTrain).argmax(1)
+ .eq(labels.index(idxTrain)).to_type(ScalarType.Float32).mean().item<float>();
+ var valAcc = evalOutput.index(idxVal).argmax(1)
+ .eq(labels.index(idxVal)).to_type(ScalarType.Float32).mean().item<float>();
+
+ Console.WriteLine($"\tEpoch {epoch:D4} | Loss: {loss.item<float>():F4} | Train Acc: {trainAcc:F4} | Val Acc: {valAcc:F4}");
+ }
+ }
+ }
+
+ if (totalTime.Elapsed.TotalSeconds > timeout) break;
+ }
+
+ // Final test
+ model.eval();
+ using (torch.no_grad()) {
+ var testOutput = model.forward(features, adjMat);
+ var testAcc = testOutput.index(idxTest).argmax(1)
+ .eq(labels.index(idxTest)).to_type(ScalarType.Float32).mean().item<float>();
+ Console.WriteLine($"\tTest accuracy: {testAcc:F4}");
+ }
+
+ totalTime.Stop();
+ Console.WriteLine($"Elapsed time: {totalTime.Elapsed.TotalSeconds:F1} s.");
+ }
+ }
+}
diff --git a/src/CSharp/CSharpExamples/GCN.cs b/src/CSharp/CSharpExamples/GCN.cs
new file mode 100644
index 0000000..21fefa8
--- /dev/null
+++ b/src/CSharp/CSharpExamples/GCN.cs
@@ -0,0 +1,137 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.Diagnostics;
+using System.Linq;
+
+using TorchSharp;
+using TorchSharp.Examples;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace CSharpExamples
+{
+ /// <summary>
+ /// Graph Convolutional Network (GCN) for node classification
+ ///
+ /// Based on: https://github.com/pytorch/examples/tree/main/gcn
+ ///
+ /// Implements a 2-layer GCN for semi-supervised node classification.
+ /// Uses synthetic graph data for demonstration since the Cora dataset
+ /// requires external download infrastructure.
+ /// </summary>
+ public class GCN
+ {
+ internal static void Run(int epochs, int timeout, string logdir)
+ {
+ var device =
+ torch.cuda.is_available() ? torch.CUDA :
+ torch.mps_is_available() ? torch.MPS :
+ torch.CPU;
+
+ Console.WriteLine();
+ Console.WriteLine($"\tRunning GCN on {device.type} for {epochs} epochs, terminating after {TimeSpan.FromSeconds(timeout)}.");
+ Console.WriteLine();
+
+ torch.random.manual_seed(42);
+
+ // Create synthetic graph data for demonstration
+ // In practice, you would load a real graph dataset like Cora
+ int numNodes = 2708;
+ int numFeatures = 1433;
+ int numClasses = 7;
+ int hiddenDim = 16;
+
+ Console.WriteLine($"\tGenerating synthetic graph data...");
+ Console.WriteLine($"\t Nodes: {numNodes}, Features: {numFeatures}, Classes: {numClasses}");
+
+ // Random features and labels
+ var features = torch.randn(numNodes, numFeatures, device: device);
+ var labels = torch.randint(numClasses, numNodes, device: device);
+
+ // Create a random sparse adjacency matrix (simulating graph structure)
+ int numEdges = 10556;
+ var edgeIdx1 = torch.randint(numNodes, numEdges, device: device);
+ var edgeIdx2 = torch.randint(numNodes, numEdges, device: device);
+ var adjMat = torch.zeros(numNodes, numNodes, device: device);
+
+ // Add edges and self-loops
+ for (int i = 0; i < numNodes; i++) {
+ adjMat[i, i] = 1.0f; // self-loops
+ }
+ // Note: In a real implementation, you'd construct the adjacency matrix properly
+ // and apply the renormalization trick D^(-1/2) A D^(-1/2)
+ // For now, use identity + random edges normalized by degree
+ adjMat = adjMat + torch.eye(numNodes, device: device) * 0.1f;
+
+ // Normalize adjacency matrix (simplified)
+ var degree = adjMat.sum(dim: 1);
+ var degreeInvSqrt = torch.sqrt(1.0f / degree);
+ degreeInvSqrt = torch.where(degreeInvSqrt.isinf(), torch.zeros_like(degreeInvSqrt), degreeInvSqrt);
+ var degreeMatrix = torch.diag(degreeInvSqrt);
+ adjMat = torch.mm(torch.mm(degreeMatrix, adjMat), degreeMatrix);
+
+ // Split into train/val/test
+ var idx = torch.randperm(numNodes, device: device);
+ var idxTrain = idx.slice(0, 1500, numNodes, 1);
+ var idxVal = idx.slice(0, 1000, 1500, 1);
+ var idxTest = idx.slice(0, 0, 1000, 1);
+
+ Console.WriteLine($"\tCreating GCN model...");
+
+ var model = new GCNModel("gcn", numFeatures, hiddenDim, numClasses,
+ useBias: true, dropoutP: 0.5, device: device);
+
+ var optimizer = optim.Adam(model.parameters(), lr: 0.01, weight_decay: 5e-4);
+ var criterion = NLLLoss();
+
+ Console.WriteLine($"\tTraining...");
+
+ Stopwatch totalTime = new Stopwatch();
+ totalTime.Start();
+
+ for (int epoch = 1; epoch <= epochs; epoch++) {
+ using (var d = torch.NewDisposeScope()) {
+ // Training
+ model.train();
+ optimizer.zero_grad();
+
+ var output = model.forward(features, adjMat);
+ var loss = criterion.forward(output.index(idxTrain), labels.index(idxTrain));
+ loss.backward();
+ optimizer.step();
+
+ if (epoch % 20 == 0 || epoch == 1) {
+ // Evaluate
+ model.eval();
+ using (torch.no_grad()) {
+ var evalOutput = model.forward(features, adjMat);
+
+ var trainAcc = evalOutput.index(idxTrain).argmax(1)
+ .eq(labels.index(idxTrain)).to_type(ScalarType.Float32).mean().item<float>();
+ var valAcc = evalOutput.index(idxVal).argmax(1)
+ .eq(labels.index(idxVal)).to_type(ScalarType.Float32).mean().item<float>();
+
+ Console.WriteLine($"\tEpoch {epoch:D4} | Loss: {loss.item<float>():F4} | Train Acc: {trainAcc:F4} | Val Acc: {valAcc:F4}");
+ }
+ }
+ }
+
+ if (totalTime.Elapsed.TotalSeconds > timeout) break;
+ }
+
+ // Final test evaluation
+ model.eval();
+ using (torch.no_grad()) {
+ var testOutput = model.forward(features, adjMat);
+ var testAcc = testOutput.index(idxTest).argmax(1)
+ .eq(labels.index(idxTest)).to_type(ScalarType.Float32).mean().item<float>();
+ Console.WriteLine($"\tTest accuracy: {testAcc:F4}");
+ }
+
+ totalTime.Stop();
+ Console.WriteLine($"Elapsed time: {totalTime.Elapsed.TotalSeconds:F1} s.");
+ }
+ }
+}
diff --git a/src/CSharp/CSharpExamples/Program.cs b/src/CSharp/CSharpExamples/Program.cs
index 57c81a4..c69a6c9 100644
--- a/src/CSharp/CSharpExamples/Program.cs
+++ b/src/CSharp/CSharpExamples/Program.cs
@@ -78,6 +78,45 @@ static void Main(string[] args)
MNISTRnn.Run(epochs, timeout, logdir);
break;
+ case "super-resolution":
+ SuperResolution.Run(epochs, timeout, logdir);
+ break;
+
+ case "forward-forward":
+ ForwardForward.Run(epochs, timeout, logdir);
+ break;
+
+ case "siamese":
+ SiameseNetwork.Run(epochs, timeout, logdir);
+ break;
+
+ case "gcn":
+ GCN.Run(epochs, timeout, logdir);
+ break;
+
+ case "gat":
+ GAT.Run(epochs, timeout, logdir);
+ break;
+
+ case "time-seq":
+ TimeSequencePrediction.Run(epochs, timeout, logdir);
+ break;
+
+ case "wlm-lstm":
+ case "wlm-gru":
+ case "wlm-rnn-tanh":
+ case "wlm-rnn-relu":
+ var rnnType = argumentParser[idx].ToLower() switch
+ {
+ "wlm-lstm" => "LSTM",
+ "wlm-gru" => "GRU",
+ "wlm-rnn-tanh" => "RNN_TANH",
+ "wlm-rnn-relu" => "RNN_RELU",
+ _ => "LSTM"
+ };
+ WordLanguageModel.Run(rnnType, epochs, timeout, logdir);
+ break;
+
default:
Console.Error.WriteLine($"Unknown model name: {argumentParser[idx]}");
break;
diff --git a/src/CSharp/CSharpExamples/SiameseNetwork.cs b/src/CSharp/CSharpExamples/SiameseNetwork.cs
new file mode 100644
index 0000000..3e5aff5
--- /dev/null
+++ b/src/CSharp/CSharpExamples/SiameseNetwork.cs
@@ -0,0 +1,204 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.IO;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+using TorchSharp;
+using static TorchSharp.torchvision;
+
+using TorchSharp.Examples;
+using TorchSharp.Examples.Utils;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace CSharpExamples
+{
+ /// <summary>
+ /// Siamese Network for image similarity
+ ///
+ /// Based on: https://github.com/pytorch/examples/tree/main/siamese_network
+ ///
+ /// Trains a Siamese network to determine if two MNIST images are from the
+ /// same class or different classes. Uses BCELoss for training.
+ /// </summary>
+ public class SiameseNetwork
+ {
+ private static int _trainBatchSize = 64;
+ private static int _testBatchSize = 128;
+ private readonly static int _logInterval = 100;
+
+ internal static void Run(int epochs, int timeout, string logdir)
+ {
+ var device =
+ torch.cuda.is_available() ? torch.CUDA :
+ torch.mps_is_available() ? torch.MPS :
+ torch.CPU;
+
+ Console.WriteLine();
+ Console.WriteLine($"\tRunning Siamese Network on {device.type} for {epochs} epochs, terminating after {TimeSpan.FromSeconds(timeout)}.");
+ Console.WriteLine();
+
+ torch.random.manual_seed(1);
+
+ var dataset = "mnist";
+ var datasetPath = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", dataset);
+
+ var sourceDir = datasetPath;
+ var targetDir = Path.Combine(datasetPath, "test_data");
+
+ if (!Directory.Exists(targetDir)) {
+ Directory.CreateDirectory(targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "train-images-idx3-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "train-labels-idx1-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-images-idx3-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-labels-idx1-ubyte.gz"), targetDir);
+ }
+
+ Console.WriteLine($"\tCreating the model...");
+
+ var model = new SiameseNetworkModel("siamese", device);
+ var optimizer = optim.Adadelta(model.parameters(), lr: 1.0);
+ var scheduler = optim.lr_scheduler.StepLR(optimizer, 1, 0.7);
+
+ Console.WriteLine($"\tPreparing training and test data...");
+ Console.WriteLine();
+
+ using (MNISTReader train = new MNISTReader(targetDir, "train", _trainBatchSize, device: device, shuffle: true),
+ test = new MNISTReader(targetDir, "t10k", _testBatchSize, device: device))
+ {
+ Stopwatch totalTime = new Stopwatch();
+ totalTime.Start();
+
+ for (var epoch = 1; epoch <= epochs; epoch++) {
+ Train(model, optimizer, device, train, epoch, train.Size);
+ Test(model, device, test, epoch, test.Size);
+ scheduler.step();
+
+ Console.WriteLine($"End-of-epoch memory use: {GC.GetTotalMemory(false)}");
+
+ if (totalTime.Elapsed.TotalSeconds > timeout) break;
+ }
+
+ totalTime.Stop();
+ Console.WriteLine($"Elapsed time: {totalTime.Elapsed.TotalSeconds:F1} s.");
+ }
+ }
+
+ /// <summary>
+ /// Creates pairs of images from the same dataset for Siamese training.
+ /// Even indices create same-class pairs (label=1), odd create different-class pairs (label=0).
+ /// </summary>
+ private static (Tensor, Tensor, Tensor) CreatePairs(Tensor data, Tensor labels, int batchIdx)
+ {
+ var rng = new Random(batchIdx);
+ int batchSize = (int)data.shape[0];
+
+ var images1 = new List<Tensor>();
+ var images2 = new List<Tensor>();
+ var targets = new List<float>();
+
+ for (int i = 0; i < batchSize; i++) {
+ images1.Add(data[i].unsqueeze(0));
+
+ if (i % 2 == 0) {
+ // Same class pair
+ var sameLabel = labels[i].item<long>();
+ // Find another image with the same label
+ int j = rng.Next(batchSize);
+ int attempts = 0;
+ while (labels[j].item<long>() != sameLabel && attempts < batchSize) {
+ j = rng.Next(batchSize);
+ attempts++;
+ }
+ images2.Add(data[j].unsqueeze(0));
+ targets.Add(1.0f);
+ } else {
+ // Different class pair
+ var thisLabel = labels[i].item<long>();
+ int j = rng.Next(batchSize);
+ int attempts = 0;
+ while (labels[j].item<long>() == thisLabel && attempts < batchSize) {
+ j = rng.Next(batchSize);
+ attempts++;
+ }
+ images2.Add(data[j].unsqueeze(0));
+ targets.Add(0.0f);
+ }
+ }
+
+ var img1 = torch.cat(images1.ToArray(), dim: 0);
+ var img2 = torch.cat(images2.ToArray(), dim: 0);
+ var tgt = torch.tensor(targets.ToArray());
+
+ return (img1, img2, tgt);
+ }
+
+ private static void Train(
+ SiameseNetworkModel model,
+ optim.Optimizer optimizer,
+ Device device,
+ IEnumerable<(Tensor, Tensor)> dataLoader,
+ int epoch,
+ int size)
+ {
+ model.train();
+ var criterion = BCELoss();
+ int batchIdx = 0;
+
+ foreach (var (data, labels) in dataLoader) {
+ using (var d = torch.NewDisposeScope()) {
+ var (images1, images2, targets) = CreatePairs(data, labels, batchIdx);
+ targets = targets.to(device);
+
+ optimizer.zero_grad();
+ var outputs = model.forward(images1, images2).squeeze();
+ var loss = criterion.forward(outputs, targets);
+ loss.backward();
+ optimizer.step();
+
+ if (batchIdx % _logInterval == 0) {
+ Console.WriteLine($"\tTrain Epoch: {epoch} [{batchIdx * _trainBatchSize}/{size}] Loss: {loss.item<float>():F6}");
+ }
+ batchIdx++;
+ }
+ }
+ }
+
+ private static void Test(
+ SiameseNetworkModel model,
+ Device device,
+ IEnumerable<(Tensor, Tensor)> dataLoader,
+ int epoch,
+ int size)
+ {
+ model.eval();
+ double testLoss = 0;
+ int correct = 0;
+ int total = 0;
+ var criterion = BCELoss();
+
+ using (torch.no_grad()) {
+ int batchIdx = 0;
+ foreach (var (data, labels) in dataLoader) {
+ using (var d = torch.NewDisposeScope()) {
+ var (images1, images2, targets) = CreatePairs(data, labels, batchIdx + 10000);
+ targets = targets.to(device);
+
+ var outputs = model.forward(images1, images2).squeeze();
+ testLoss += criterion.forward(outputs, targets).item<float>();
+
+ var pred = torch.where(outputs > 0.5, 1, 0);
+ correct += pred.eq(targets.to_type(ScalarType.Int32).view_as(pred)).sum().item<int>();
+ total += (int)targets.shape[0];
+ batchIdx++;
+ }
+ }
+ }
+
+ Console.WriteLine($"====> Test set: Average loss: {testLoss / total:F4}, Accuracy: {correct}/{total} ({100.0 * correct / total:F0}%)");
+ }
+ }
+}
diff --git a/src/CSharp/CSharpExamples/SuperResolution.cs b/src/CSharp/CSharpExamples/SuperResolution.cs
new file mode 100644
index 0000000..21123b4
--- /dev/null
+++ b/src/CSharp/CSharpExamples/SuperResolution.cs
@@ -0,0 +1,150 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.IO;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+using TorchSharp;
+using static TorchSharp.torchvision;
+
+using TorchSharp.Examples;
+using TorchSharp.Examples.Utils;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace CSharpExamples
+{
+ /// <summary>
+ /// Super-Resolution using ESPCN (Efficient Sub-Pixel Convolutional Neural Network)
+ ///
+ /// Based on: https://github.com/pytorch/examples/tree/main/super_resolution
+ ///
+ /// Trains a model to upscale low-resolution images using the sub-pixel convolution
+ /// technique (PixelShuffle). Uses MNIST as a simple dataset for demonstration.
+ /// </summary>
+ public class SuperResolution
+ {
+ private static int _trainBatchSize = 64;
+ private static int _testBatchSize = 64;
+ private static int _upscaleFactor = 2;
+ private readonly static int _logInterval = 100;
+
+ internal static void Run(int epochs, int timeout, string logdir)
+ {
+ var device =
+ torch.cuda.is_available() ? torch.CUDA :
+ torch.mps_is_available() ? torch.MPS :
+ torch.CPU;
+
+ Console.WriteLine();
+ Console.WriteLine($"\tRunning SuperResolution on {device.type} for {epochs} epochs, terminating after {TimeSpan.FromSeconds(timeout)}.");
+ Console.WriteLine();
+
+ torch.random.manual_seed(1);
+
+ var dataset = "mnist";
+ var datasetPath = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", dataset);
+
+ var sourceDir = datasetPath;
+ var targetDir = Path.Combine(datasetPath, "test_data");
+
+ if (!Directory.Exists(targetDir)) {
+ Directory.CreateDirectory(targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "train-images-idx3-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "train-labels-idx1-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-images-idx3-ubyte.gz"), targetDir);
+ Decompress.DecompressGZipFile(Path.Combine(sourceDir, "t10k-labels-idx1-ubyte.gz"), targetDir);
+ }
+
+ Console.WriteLine($"\tCreating the model...");
+
+ var model = new SuperResolutionModel("super_resolution", _upscaleFactor, device);
+ var optimizer = optim.Adam(model.parameters(), lr: 1e-3);
+ var loss = MSELoss();
+
+ Console.WriteLine($"\tPreparing training and test data...");
+ Console.WriteLine();
+
+ using (MNISTReader train = new MNISTReader(targetDir, "train", _trainBatchSize, device: device, shuffle: true),
+ test = new MNISTReader(targetDir, "t10k", _testBatchSize, device: device))
+ {
+ Stopwatch totalTime = new Stopwatch();
+ totalTime.Start();
+
+ for (var epoch = 1; epoch <= epochs; epoch++) {
+ Train(model, optimizer, loss, device, train, epoch, train.Size);
+ Test(model, loss, device, test, epoch, test.Size);
+
+ Console.WriteLine($"End-of-epoch memory use: {GC.GetTotalMemory(false)}");
+
+ if (totalTime.Elapsed.TotalSeconds > timeout) break;
+ }
+
+ totalTime.Stop();
+ Console.WriteLine($"Elapsed time: {totalTime.Elapsed.TotalSeconds:F1} s.");
+ }
+ }
+
+ private static void Train(
+ SuperResolutionModel model,
+ optim.Optimizer optimizer,
+ Loss<Tensor, Tensor, Tensor> lossFn,
+ Device device,
+ IEnumerable<(Tensor, Tensor)> dataLoader,
+ int epoch,
+ int size)
+ {
+ model.train();
+ int batchIdx = 0;
+
+ foreach (var (data, _) in dataLoader) {
+ using (var d = torch.NewDisposeScope()) {
+ // Use the original image as target, downsample as input
+ var target = data;
+ // Simple downscale by average pooling, then upscale back
+ var input = avg_pool2d(data, _upscaleFactor);
+
+ optimizer.zero_grad();
+ var output = model.forward(input);
+ var loss = lossFn.forward(output, target);
+ loss.backward();
+ optimizer.step();
+
+ if (batchIdx % _logInterval == 0) {
+ Console.WriteLine($"\tTrain Epoch: {epoch} [{batchIdx * _trainBatchSize}/{size}] Loss: {loss.item<float>():F6}");
+ }
+ batchIdx++;
+ }
+ }
+ }
+
+ private static void Test(
+ SuperResolutionModel model,
+ Loss<Tensor, Tensor, Tensor> lossFn,
+ Device device,
+ IEnumerable<(Tensor, Tensor)> dataLoader,
+ int epoch,
+ int size)
+ {
+ model.eval();
+ double testLoss = 0;
+ int batches = 0;
+
+ using (torch.no_grad()) {
+ foreach (var (data, _) in dataLoader) {
+ using (var d = torch.NewDisposeScope()) {
+ var target = data;
+ var input = avg_pool2d(data, _upscaleFactor);
+ var output = model.forward(input);
+ testLoss += lossFn.forward(output, target).item<float>();
+ batches++;
+ }
+ }
+ }
+
+ Console.WriteLine($"====> Epoch {epoch}: Average test loss: {testLoss / batches:F6}");
+ }
+ }
+}
diff --git a/src/CSharp/CSharpExamples/TimeSequencePrediction.cs b/src/CSharp/CSharpExamples/TimeSequencePrediction.cs
new file mode 100644
index 0000000..e80b98f
--- /dev/null
+++ b/src/CSharp/CSharpExamples/TimeSequencePrediction.cs
@@ -0,0 +1,136 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.Diagnostics;
+
+using TorchSharp;
+using TorchSharp.Examples;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+namespace CSharpExamples
+{
+ /// <summary>
+ /// Time Sequence Prediction using LSTM
+ ///
+ /// Based on: https://github.com/pytorch/examples/tree/main/time_sequence_prediction
+ ///
+ /// Generates sine wave data with random phase shifts, trains a stacked LSTMCell model
+ /// to predict the next value, and then predicts future values beyond the training data.
+ /// Uses synthetic data — no dataset download needed.
+ /// </summary>
+ public class TimeSequencePrediction
+ {
+ private const int T = 20;
+ private const int L = 1000;
+ private const int N = 100;
+
+ internal static void Run(int epochs, int timeout, string logdir)
+ {
+ var device =
+ torch.cuda.is_available() ? torch.CUDA :
+ torch.mps_is_available() ? torch.MPS :
+ torch.CPU;
+
+ Console.WriteLine();
+ Console.WriteLine($"\tRunning TimeSequencePrediction on {device.type.ToString()} for {epochs} epochs, terminating after {TimeSpan.FromSeconds(timeout)}.");
+ Console.WriteLine();
+
+ torch.random.manual_seed(0);
+
+ // Generate sine wave training data (matching PyTorch's generate_sine_wave.py)
+ Console.WriteLine($"\tGenerating sine wave training data...");
+ var data = GenerateSineWaveData();
+
+ var input = data[TensorIndex.Slice(3, null), TensorIndex.Slice(null, -1)];
+ var target = data[TensorIndex.Slice(3, null), TensorIndex.Slice(1, null)];
+ var test_input = data[TensorIndex.Slice(null, 3), TensorIndex.Slice(null, -1)];
+ var test_target = data[TensorIndex.Slice(null, 3), TensorIndex.Slice(1, null)];
+
+ // Move to device
+ input = input.to(device);
+ target = target.to(device);
+ test_input = test_input.to(device);
+ test_target = test_target.to(device);
+
+ Console.WriteLine($"\tCreating the model...");
+ Console.WriteLine();
+
+ var model = new SequenceModel("time-seq", device);
+ model.to(torch.float64);
+
+ var criterion = MSELoss();
+ var optimizer = torch.optim.LBFGS(model.parameters(), lr: 0.8);
+
+ var writer = String.IsNullOrEmpty(logdir) ? null : torch.utils.tensorboard.SummaryWriter(logdir, createRunName: true);
+
+ Stopwatch totalTime = new Stopwatch();
+ totalTime.Start();
+
+ for (var epoch = 0; epoch < epochs; epoch++)
+ {
+ using (var d = torch.NewDisposeScope())
+ {
+ Console.WriteLine($"STEP: {epoch}");
+
+ // Training step with LBFGS closure
+ Tensor lastLoss = null;
+
+ Tensor closure()
+ {
+ optimizer.zero_grad();
+ var output = model.forward(input, 0);
+ var loss = criterion.forward(output, target);
+ Console.WriteLine($"\tloss: {loss.item<double>():F6}");
+ loss.backward();
+ lastLoss = loss;
+ return loss;
+ }
+
+ optimizer.step(closure);
+
+ // Test: predict with future steps
+ using (torch.no_grad())
+ {
+ var future = 1000;
+ var pred = model.forward(test_input, future);
+ var loss = criterion.forward(pred[TensorIndex.Colon, TensorIndex.Slice(null, -future)], test_target);
+ Console.WriteLine($"\ttest loss: {loss.item<double>():F6}");
+
+ if (writer != null)
+ {
+ writer.add_scalar("time_seq/train_loss", (float)lastLoss.item<double>(), epoch);
+ writer.add_scalar("time_seq/test_loss", (float)loss.item<double>(), epoch);
+ }
+ }
+
+ if (totalTime.Elapsed.TotalSeconds > timeout) break;
+ }
+ }
+
+ totalTime.Stop();
+ Console.WriteLine($"\nElapsed time: {totalTime.Elapsed.TotalSeconds:F1} s.");
+ }
+
+ /// <summary>
+ /// Generates sine wave data matching PyTorch's generate_sine_wave.py.
+ /// Creates N sine waves of length L with random phase offsets.
+ /// </summary>
+ private static Tensor GenerateSineWaveData()
+ {
+ var rng = new Random(2);
+ var x = new double[N, L];
+
+ for (int i = 0; i < N; i++)
+ {
+ var offset = rng.Next(-4 * T, 4 * T);
+ for (int j = 0; j < L; j++)
+ {
+ x[i, j] = Math.Sin((j + offset) / (double)T);
+ }
+ }
+
+ return torch.tensor(x, dtype: torch.float64);
+ }
+ }
+}
diff --git a/src/CSharp/CSharpExamples/WordLanguageModel.cs b/src/CSharp/CSharpExamples/WordLanguageModel.cs
new file mode 100644
index 0000000..fa151f0
--- /dev/null
+++ b/src/CSharp/CSharpExamples/WordLanguageModel.cs
@@ -0,0 +1,245 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.IO;
+using System.Linq;
+using System.Collections.Generic;
+using System.Diagnostics;
+
+using TorchSharp;
+using TorchSharp.Examples;
+using TorchSharp.Examples.Utils;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace CSharpExamples
+{
+    /// <summary>
+    /// Word-level Language Model using RNN (LSTM/GRU/RNN)
+    ///
+    /// Based on: https://github.com/pytorch/examples/tree/main/word_language_model
+    ///
+    /// Trains a word-level language model on WikiText-2 using an RNN (LSTM, GRU, or vanilla RNN).
+    /// This complements the existing SequenceToSequence example which uses a Transformer.
+    ///
+    /// WikiText-2 dataset available at:
+    /// https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip
+    /// </summary>
+    public class WordLanguageModel
+    {
+        private readonly static string _dataLocation = Path.Join(Environment.GetFolderPath(Environment.SpecialFolder.DesktopDirectory), "..", "Downloads", "wikitext-2-v1");
+
+        // Hyper-parameters, matching the PyTorch example defaults.
+        private const long emsize = 200;      // embedding dimension
+        private const long nhid = 200;        // hidden units per RNN layer
+        private const long nlayers = 2;       // number of stacked RNN layers
+        private const double dropout = 0.2;
+
+        private const int batch_size = 20;
+        private const int eval_batch_size = 10;
+        private const int bptt = 35;          // truncated-BPTT sequence length
+
+        /// <summary>
+        /// Builds the vocabulary, trains the model, and reports validation/test loss.
+        /// </summary>
+        /// <param name="rnnType">One of: LSTM, GRU, RNN_TANH, RNN_RELU.</param>
+        /// <param name="epochs">Maximum number of training epochs.</param>
+        /// <param name="timeout">Wall-clock training budget in seconds.</param>
+        /// <param name="logdir">TensorBoard log directory; null or empty disables logging.</param>
+        internal static void Run(string rnnType, int epochs, int timeout, string logdir)
+        {
+            torch.random.manual_seed(1111);
+
+            var device =
+                torch.cuda.is_available() ? torch.CUDA :
+                torch.mps_is_available() ? torch.MPS :
+                torch.CPU;
+
+            Console.WriteLine();
+            Console.WriteLine($"\tRunning WordLanguageModel ({rnnType}) on {device.type.ToString()} for {epochs} epochs, terminating after {TimeSpan.FromSeconds(timeout)}.");
+            Console.WriteLine();
+
+            Console.WriteLine($"\tPreparing training and test data...");
+
+            var vocab_iter = TorchText.Datasets.WikiText2("train", _dataLocation);
+            var tokenizer = TorchText.Data.Utils.get_tokenizer("basic_english");
+
+            var counter = new TorchText.Vocab.Counter<string>();
+            foreach (var item in vocab_iter)
+            {
+                counter.update(tokenizer(item));
+            }
+
+            var vocab = new TorchText.Vocab.Vocab(counter);
+
+            var (train_iter, valid_iter, test_iter) = TorchText.Datasets.WikiText2(_dataLocation);
+
+            var train_data = Batchify(ProcessInput(train_iter, tokenizer, vocab), batch_size).to((Device)device);
+            var valid_data = Batchify(ProcessInput(valid_iter, tokenizer, vocab), eval_batch_size).to((Device)device);
+            var test_data = Batchify(ProcessInput(test_iter, tokenizer, vocab), eval_batch_size).to((Device)device);
+
+            var ntokens = vocab.Count;
+
+            Console.WriteLine($"\tVocabulary size: {ntokens}");
+            Console.WriteLine($"\tCreating the {rnnType} model...");
+            Console.WriteLine();
+
+            var model = new RNNModel(rnnType, ntokens, emsize, nhid, nlayers, dropout);
+            model.to((Device)device);
+
+            var criterion = NLLLoss();
+            var lr = 20.0;
+
+            var writer = String.IsNullOrEmpty(logdir) ? null : torch.utils.tensorboard.SummaryWriter(logdir, createRunName: true);
+
+            var totalTime = new Stopwatch();
+            totalTime.Start();
+
+            double? best_val_loss = null;
+
+            for (var epoch = 1; epoch <= epochs; epoch++)
+            {
+                var sw = new Stopwatch();
+                sw.Start();
+
+                Train(epoch, train_data, model, criterion, ntokens, lr, device);
+
+                var val_loss = Evaluate(valid_data, model, criterion, ntokens, device);
+                sw.Stop();
+
+                Console.WriteLine($"\nEnd of epoch: {epoch} | lr: {lr:0.00} | time: {sw.Elapsed.TotalSeconds:0.0}s | valid loss: {val_loss:0.00} | valid ppl: {Math.Exp(val_loss):0.00}\n");
+
+                if (writer != null)
+                {
+                    writer.add_scalar("wlm/valid_loss", (float)val_loss, epoch);
+                    writer.add_scalar("wlm/valid_ppl", (float)Math.Exp(val_loss), epoch);
+                }
+
+                // Save best model and anneal learning rate
+                if (best_val_loss == null || val_loss < best_val_loss.Value)
+                {
+                    best_val_loss = val_loss;
+                }
+                else
+                {
+                    // Anneal the learning rate if no improvement
+                    lr /= 4.0;
+                }
+
+                if (totalTime.Elapsed.TotalSeconds > timeout) break;
+            }
+
+            var test_loss = Evaluate(test_data, model, criterion, ntokens, device);
+            totalTime.Stop();
+
+            Console.WriteLine($"\nEnd of training | time: {totalTime.Elapsed.TotalSeconds:0.0}s | test loss: {test_loss:0.00} | test ppl: {Math.Exp(test_loss):0.00}\n");
+        }
+
+        /// <summary>
+        /// One training epoch: truncated BPTT with gradient clipping and a manual SGD update,
+        /// matching the PyTorch example's default (no optimizer object).
+        /// </summary>
+        private static void Train(int epoch, Tensor train_data, RNNModel model, Loss<Tensor, Tensor, Tensor> criterion, int ntokens, double lr, Device device)
+        {
+            model.train();
+            var total_loss = 0.0f;
+            var log_interval = 200;
+
+            var hidden = model.InitHidden(batch_size, device);
+
+            using (var d = torch.NewDisposeScope())
+            {
+                var batch = 0;
+
+                for (int i = 0; i < train_data.shape[0] - 1; batch++, i += bptt)
+                {
+                    var (data, targets) = GetBatch(train_data, i);
+
+                    // Detach hidden state from history
+                    hidden = hidden.detach();
+
+                    model.zero_grad();
+
+                    var (output, newHidden) = model.forward(data, hidden);
+                    hidden = newHidden;
+
+                    var loss = criterion.forward(output.view(-1, ntokens), targets);
+                    loss.backward();
+
+                    // Clip gradients to prevent exploding gradients
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), 0.25);
+
+                    // Manual SGD update (matching PyTorch example default)
+                    using (torch.no_grad())
+                    {
+                        foreach (var p in model.parameters())
+                        {
+                            p.add_(p.grad, alpha: (float)(-lr));
+                        }
+                    }
+
+                    total_loss += loss.to(torch.CPU).item<float>();
+
+                    if (batch % log_interval == 0 && batch > 0)
+                    {
+                        var cur_loss = total_loss / log_interval;
+                        Console.WriteLine($"| epoch {epoch,3} | {batch,5}/{train_data.shape[0] / bptt,5} batches | lr {lr:0.00} | loss {cur_loss:0.00} | ppl {Math.Exp(cur_loss):0.00}");
+                        total_loss = 0;
+                    }
+
+                    d.DisposeEverythingBut(hidden);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Evaluates the model on held-out data and returns the average per-token loss.
+        /// </summary>
+        private static double Evaluate(Tensor eval_data, RNNModel model, Loss<Tensor, Tensor, Tensor> criterion, int ntokens, Device device)
+        {
+            model.eval();
+
+            var total_loss = 0.0f;
+            var hidden = model.InitHidden(eval_batch_size, device);
+
+            // No gradients are needed during evaluation; this saves memory and
+            // avoids building an autograd graph through the recurrent hidden state.
+            using (torch.no_grad())
+            using (var d = torch.NewDisposeScope())
+            {
+                var batch = 0;
+                for (int i = 0; i < eval_data.shape[0] - 1; batch++, i += bptt)
+                {
+                    var (data, targets) = GetBatch(eval_data, i);
+
+                    hidden = hidden.detach();
+
+                    var (output, newHidden) = model.forward(data, hidden);
+                    hidden = newHidden;
+
+                    var loss = criterion.forward(output.view(-1, ntokens), targets);
+                    // Weight each batch's loss by its sequence length.
+                    total_loss += data.shape[0] * loss.to(torch.CPU).item<float>();
+
+                    d.DisposeEverythingBut(hidden);
+                }
+            }
+
+            return total_loss / eval_data.shape[0];
+        }
+
+        /// <summary>
+        /// Tokenizes and numericalizes every line of the input, concatenating
+        /// all non-empty results into one long 1-D int64 tensor.
+        /// </summary>
+        static Tensor ProcessInput(IEnumerable<string> iter, Func<string, IEnumerable<string>> tokenizer, TorchText.Vocab.Vocab vocab)
+        {
+            List<Tensor> data = new List<Tensor>();
+            foreach (var item in iter)
+            {
+                List<long> itemData = new List<long>();
+                foreach (var token in tokenizer(item))
+                {
+                    itemData.Add(vocab[token]);
+                }
+                data.Add(torch.tensor(itemData.ToArray(), torch.int64));
+            }
+
+            var result = torch.cat(data.Where(t => t.NumberOfElements > 0).ToList(), 0);
+            return result;
+        }
+
+        /// <summary>
+        /// Reshapes a 1-D token stream into (seq_len, batch_size) columns,
+        /// trimming any remainder that does not fill a full batch.
+        /// </summary>
+        static Tensor Batchify(Tensor data, int batch_size)
+        {
+            var nbatch = data.shape[0] / batch_size;
+            using var d2 = data.narrow(0, 0, nbatch * batch_size).view(batch_size, -1).t();
+            return d2.contiguous();
+        }
+
+        /// <summary>
+        /// Returns an input chunk of up to 'bptt' time steps starting at 'index',
+        /// and the flattened targets (inputs shifted by one step).
+        /// </summary>
+        static (Tensor, Tensor) GetBatch(Tensor source, int index)
+        {
+            var len = Math.Min(bptt, (int)(source.shape[0] - 1 - index));
+            var data = source[TensorIndex.Slice(index, index + len)];
+            var target = source[TensorIndex.Slice(index + 1, index + 1 + len)].reshape(-1);
+            return (data, target);
+        }
+    }
+}
diff --git a/src/CSharp/Models/ForwardForward.cs b/src/CSharp/Models/ForwardForward.cs
new file mode 100644
index 0000000..cf0eed9
--- /dev/null
+++ b/src/CSharp/Models/ForwardForward.cs
@@ -0,0 +1,161 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.Collections.Generic;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Forward-Forward MNIST model based on: https://github.com/pytorch/examples/tree/main/mnist_forward_forward
+    ///
+    /// Implements the Forward-Forward algorithm by Geoffrey Hinton.
+    /// Instead of backpropagation, each layer is trained independently using a local loss
+    /// that encourages high "goodness" for positive examples and low for negative ones.
+    /// </summary>
+    public class ForwardForwardLayer : Module<Tensor, Tensor>
+    {
+        private Modules.Linear linear;
+        private Module<Tensor, Tensor> relu = ReLU();
+        private double threshold;   // goodness threshold separating positive from negative examples
+
+        public ForwardForwardLayer(string name, int inFeatures, int outFeatures, double threshold = 2.0, torch.Device device = null) : base(name)
+        {
+            linear = Linear(inFeatures, outFeatures);
+            this.threshold = threshold;
+
+            RegisterComponents();
+
+            if (device != null && device.type != DeviceType.CPU)
+                this.to(device);
+        }
+
+        /// <summary>
+        /// Normalizes each input row to unit L2 length (keeping only its direction),
+        /// then applies the linear transformation followed by ReLU.
+        /// </summary>
+        public override Tensor forward(Tensor x)
+        {
+            // 1e-4 guards against division by zero for all-zero rows.
+            var xDirection = x / (x.norm(1, keepdim: true, p: 2.0f) + 1e-4);
+            return relu.forward(torch.mm(xDirection, linear.weight.t()) + linear.bias.unsqueeze(0));
+        }
+
+        /// <summary>
+        /// Train this layer using the Forward-Forward algorithm.
+        /// Returns detached outputs for positive and negative examples to pass to the next layer.
+        /// </summary>
+        public (Tensor, Tensor) TrainLayer(Tensor xPos, Tensor xNeg, int numEpochs, double lr, int logInterval = 10)
+        {
+            // The optimizer holds native state; dispose it when layer training completes.
+            using var opt = optim.Adam(this.parameters(), lr: lr);
+
+            for (int i = 0; i < numEpochs; i++) {
+                using var d = torch.NewDisposeScope();
+
+                // "Goodness" = mean squared activation per sample.
+                var gPos = this.forward(xPos).pow(2).mean(new long[] { 1 });
+                var gNeg = this.forward(xNeg).pow(2).mean(new long[] { 1 });
+
+                // Loss: log(1 + exp(-gPos + threshold)) + log(1 + exp(gNeg - threshold))
+                var loss = torch.log1p(
+                    torch.exp(
+                        torch.cat(new Tensor[] {
+                            -gPos + threshold,
+                            gNeg - threshold
+                        })
+                    )
+                ).mean();
+
+                opt.zero_grad();
+                loss.backward();
+                opt.step();
+
+                if (i % logInterval == 0) {
+                    Console.WriteLine($"\t\tLoss: {loss.item<float>():F4}");
+                }
+
+                d.DisposeEverythingBut(gPos, gNeg);
+            }
+
+            return (this.forward(xPos).detach(), this.forward(xNeg).detach());
+        }
+    }
+
+    /// <summary>
+    /// Forward-Forward network composed of multiple independently-trained layers.
+    /// </summary>
+    public class ForwardForwardNet
+    {
+        private List<ForwardForwardLayer> layers = new List<ForwardForwardLayer>();
+        private torch.Device device;
+
+        /// <summary>
+        /// Builds one ForwardForwardLayer for each consecutive pair in 'dims'.
+        /// </summary>
+        public ForwardForwardNet(int[] dims, torch.Device device = null)
+        {
+            this.device = device ?? torch.CPU;
+            for (int i = 0; i < dims.Length - 1; i++) {
+                layers.Add(new ForwardForwardLayer($"ff_layer_{i}", dims[i], dims[i + 1], device: this.device));
+            }
+        }
+
+        /// <summary>
+        /// Overlay label information onto the input data (first 'numClasses' pixels).
+        /// </summary>
+        public static Tensor OverlayLabelOnInput(Tensor x, Tensor y, int numClasses = 10)
+        {
+            var x_ = x.clone();
+            x_[TensorIndex.Colon, TensorIndex.Slice(null, numClasses)] *= 0.0f;
+            // x.max() is loop-invariant; compute it once instead of per row.
+            var maxVal = x.max();
+            for (int i = 0; i < x_.shape[0]; i++) {
+                x_[i, y[i].item<long>()] = maxVal;
+            }
+            return x_;
+        }
+
+        /// <summary>
+        /// Generate negative labels (different from the true labels).
+        /// </summary>
+        public static Tensor GetNegativeLabels(Tensor y)
+        {
+            var yNeg = y.clone();
+            var rng = new Random();
+            for (int i = 0; i < y.shape[0]; i++) {
+                var trueLabel = y[i].item<long>();
+                long newLabel;
+                do {
+                    newLabel = rng.Next(10);
+                } while (newLabel == trueLabel);
+                yNeg[i] = torch.tensor(newLabel);
+            }
+            return yNeg;
+        }
+
+        /// <summary>
+        /// Train all layers sequentially using the Forward-Forward algorithm.
+        /// Each layer trains on the (detached) outputs of the previous one.
+        /// </summary>
+        public void Train(Tensor xPos, Tensor xNeg, int numEpochs, double lr, int logInterval = 10)
+        {
+            var hPos = xPos;
+            var hNeg = xNeg;
+            for (int i = 0; i < layers.Count; i++) {
+                Console.WriteLine($"\tTraining layer {i}...");
+                (hPos, hNeg) = layers[i].TrainLayer(hPos, hNeg, numEpochs, lr, logInterval);
+            }
+        }
+
+        /// <summary>
+        /// Predict by measuring total "goodness" for each possible label.
+        /// The label whose overlay maximizes accumulated goodness wins.
+        /// </summary>
+        public Tensor Predict(Tensor x)
+        {
+            var goodnessList = new List<Tensor>();
+
+            for (int label = 0; label < 10; label++) {
+                var h = OverlayLabelOnInput(x, torch.full(x.shape[0], label, dtype: ScalarType.Int64, device: device));
+                var goodness = torch.tensor(0.0f, device: device);
+                foreach (var layer in layers) {
+                    h = layer.forward(h);
+                    goodness = goodness + h.pow(2).mean(new long[] { 1 });
+                }
+                goodnessList.Add(goodness.unsqueeze(1));
+            }
+
+            var goodnessPerLabel = torch.cat(goodnessList.ToArray(), 1);
+            return goodnessPerLabel.argmax(1);
+        }
+    }
+}
diff --git a/src/CSharp/Models/GAT.cs b/src/CSharp/Models/GAT.cs
new file mode 100644
index 0000000..141886a
--- /dev/null
+++ b/src/CSharp/Models/GAT.cs
@@ -0,0 +1,141 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Graph Attention Layer as described in "Graph Attention Networks" (https://arxiv.org/pdf/1710.10903.pdf).
+    ///
+    /// Computes attention coefficients for each edge in the graph, then aggregates neighbor features
+    /// using these attention weights.
+    /// </summary>
+    public class GraphAttentionLayer : Module<Tensor, Tensor, Tensor>
+    {
+        private readonly int nHeads;
+        private readonly int nHidden;
+        private readonly int outFeatures;
+        private readonly bool concat;
+        private readonly double dropoutRate;
+
+        private Modules.Parameter W;    // shared linear transform, (in_features, n_hidden * n_heads)
+        private Modules.Parameter a;    // attention vectors, (n_heads, 2 * n_hidden, 1)
+        private Module<Tensor, Tensor> leakyrelu;
+
+        public GraphAttentionLayer(string name, int inFeatures, int outFeatures, int nHeads,
+            bool concat = false, double dropout = 0.4, double leakyReluSlope = 0.2) : base(name)
+        {
+            this.nHeads = nHeads;
+            this.concat = concat;
+            this.dropoutRate = dropout;
+            this.outFeatures = outFeatures;
+
+            if (concat) {
+                if (outFeatures % nHeads != 0)
+                    throw new ArgumentException("outFeatures must be a multiple of nHeads when concat is true");
+                this.nHidden = outFeatures / nHeads;
+            } else {
+                this.nHidden = outFeatures;
+            }
+
+            W = Parameter(torch.empty(inFeatures, this.nHidden * nHeads));
+            a = Parameter(torch.empty(nHeads, 2 * this.nHidden, 1));
+
+            leakyrelu = LeakyReLU(leakyReluSlope);
+
+            RegisterComponents();
+            ResetParameters();
+        }
+
+        // Xavier initialization, as in the reference implementation.
+        private void ResetParameters()
+        {
+            init.xavier_normal_(W);
+            init.xavier_normal_(a);
+        }
+
+        /// <summary>
+        /// Computes raw (pre-softmax) attention logits e_ij for every node pair,
+        /// splitting 'a' into its source and target halves.
+        /// </summary>
+        private Tensor GetAttentionScores(Tensor hTransformed)
+        {
+            var sourceScores = torch.matmul(hTransformed, a.index(new TensorIndex[] {
+                TensorIndex.Colon, TensorIndex.Slice(null, nHidden), TensorIndex.Colon }));
+            var targetScores = torch.matmul(hTransformed, a.index(new TensorIndex[] {
+                TensorIndex.Colon, TensorIndex.Slice(nHidden), TensorIndex.Colon }));
+
+            // (n_heads, n_nodes, 1) + (n_heads, 1, n_nodes) = (n_heads, n_nodes, n_nodes)
+            var e = sourceScores + targetScores.mT;
+            return leakyrelu.forward(e);
+        }
+
+        /// <summary>
+        /// Forward pass: h is the node-feature matrix (n_nodes, in_features),
+        /// adjMat the adjacency matrix; returns aggregated node features.
+        /// </summary>
+        public override Tensor forward(Tensor h, Tensor adjMat)
+        {
+            long nNodes = h.shape[0];
+
+            // Apply linear transformation: W * h
+            var hTransformed = torch.mm(h, W);
+            hTransformed = nn.functional.dropout(hTransformed, dropoutRate, training);
+
+            // Reshape to (n_heads, n_nodes, n_hidden)
+            hTransformed = hTransformed.view(nNodes, nHeads, nHidden).permute(1, 0, 2);
+
+            // Get attention scores (n_heads, n_nodes, n_nodes)
+            var e = GetAttentionScores(hTransformed);
+
+            // Mask non-existent edges with a large negative value so softmax sends them to ~0.
+            var connectivityMask = -9e16 * torch.ones_like(e);
+            e = torch.where(adjMat > 0, e, connectivityMask);
+
+            // Softmax over rows
+            var attention = softmax(e, dim: -1);
+            attention = nn.functional.dropout(attention, dropoutRate, training);
+
+            // Weighted average of neighbor features
+            var hPrime = torch.matmul(attention, hTransformed);
+
+            if (concat) {
+                hPrime = hPrime.permute(1, 0, 2).contiguous().view(nNodes, outFeatures);
+            } else {
+                // Average heads instead of concatenating them.
+                hPrime = hPrime.mean(new long[] { 0 });
+            }
+
+            return hPrime;
+        }
+    }
+
+    /// <summary>
+    /// Graph Attention Network (GAT) based on: https://github.com/pytorch/examples/tree/main/gat
+    ///
+    /// Two-layer GAT for semi-supervised node classification.
+    /// The first layer uses multi-head attention with ELU activation.
+    /// The second layer uses single-head attention with log-softmax output.
+    /// </summary>
+    public class GATModel : Module<Tensor, Tensor, Tensor>
+    {
+        private GraphAttentionLayer gat1;
+        private GraphAttentionLayer gat2;
+
+        public GATModel(string name, int inFeatures, int nHidden, int nHeads, int numClasses,
+            bool concat = false, double dropout = 0.4, double leakyReluSlope = 0.2,
+            torch.Device device = null) : base(name)
+        {
+            gat1 = new GraphAttentionLayer("gat1", inFeatures, nHidden, nHeads,
+                concat: concat, dropout: dropout, leakyReluSlope: leakyReluSlope);
+            // Output layer always uses a single head and no concatenation.
+            gat2 = new GraphAttentionLayer("gat2", nHidden, numClasses, 1,
+                concat: false, dropout: dropout, leakyReluSlope: leakyReluSlope);
+
+            RegisterComponents();
+
+            if (device != null && device.type != DeviceType.CPU)
+                this.to(device);
+        }
+
+        /// <summary>
+        /// Returns per-node log-probabilities over the classes, shape (n_nodes, numClasses).
+        /// </summary>
+        public override Tensor forward(Tensor inputTensor, Tensor adjMat)
+        {
+            var x = gat1.forward(inputTensor, adjMat);
+            x = elu(x, 1.0);
+            x = gat2.forward(x, adjMat);
+            return log_softmax(x, dim: 1);
+        }
+    }
+}
diff --git a/src/CSharp/Models/GCN.cs b/src/CSharp/Models/GCN.cs
new file mode 100644
index 0000000..0260215
--- /dev/null
+++ b/src/CSharp/Models/GCN.cs
@@ -0,0 +1,80 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Graph Convolutional Layer as described in "Semi-Supervised Classification with Graph Convolutional Networks".
+    ///
+    /// H' = f(D^(-1/2) * A * D^(-1/2) * H * W)
+    /// </summary>
+    public class GraphConvLayer : Module<Tensor, Tensor, Tensor>
+    {
+        private Modules.Parameter kernel;
+        private Modules.Parameter bias;   // null when useBias is false
+
+        public GraphConvLayer(string name, int inputDim, int outputDim, bool useBias = false) : base(name)
+        {
+            kernel = Parameter(torch.empty(inputDim, outputDim));
+            init.xavier_normal_(kernel);
+
+            if (useBias) {
+                bias = Parameter(torch.zeros(outputDim));
+            }
+
+            RegisterComponents();
+        }
+
+        /// <summary>
+        /// Forward pass: inputTensor is the node-feature matrix, adjMat the
+        /// (normalized) adjacency matrix. Returns the convolved node features.
+        /// </summary>
+        public override Tensor forward(Tensor inputTensor, Tensor adjMat)
+        {
+            // Matrix multiplication between input and weight matrix
+            var support = torch.mm(inputTensor, kernel);
+            // Sparse or dense matrix multiplication between adjacency matrix and support
+            var output = torch.mm(adjMat, support);
+
+            if (bias is not null) {
+                output = output + bias;
+            }
+
+            return output;
+        }
+    }
+
+    /// <summary>
+    /// Graph Convolutional Network (GCN) based on: https://github.com/pytorch/examples/tree/main/gcn
+    ///
+    /// Two-layer GCN for semi-supervised node classification on graph data.
+    /// Uses the Cora citation network dataset.
+    /// </summary>
+    public class GCNModel : Module<Tensor, Tensor, Tensor>
+    {
+        private GraphConvLayer gc1;
+        private GraphConvLayer gc2;
+        private Module<Tensor, Tensor> dropout;
+
+        public GCNModel(string name, int inputDim, int hiddenDim, int outputDim, bool useBias = true, double dropoutP = 0.1, torch.Device device = null) : base(name)
+        {
+            gc1 = new GraphConvLayer("gc1", inputDim, hiddenDim, useBias: useBias);
+            gc2 = new GraphConvLayer("gc2", hiddenDim, outputDim, useBias: useBias);
+            dropout = Dropout(dropoutP);
+
+            RegisterComponents();
+
+            if (device != null && device.type != DeviceType.CPU)
+                this.to(device);
+        }
+
+        /// <summary>
+        /// Returns per-node log-probabilities over the classes, shape (n_nodes, outputDim).
+        /// </summary>
+        public override Tensor forward(Tensor inputTensor, Tensor adjMat)
+        {
+            var x = gc1.forward(inputTensor, adjMat);
+            x = relu(x);
+            x = dropout.forward(x);
+            x = gc2.forward(x, adjMat);
+            return log_softmax(x, dim: 1);
+        }
+    }
+}
diff --git a/src/CSharp/Models/SiameseNetwork.cs b/src/CSharp/Models/SiameseNetwork.cs
new file mode 100644
index 0000000..07c28fb
--- /dev/null
+++ b/src/CSharp/Models/SiameseNetwork.cs
@@ -0,0 +1,87 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+using static TorchSharp.torch.nn.functional;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Siamese Network model based on: https://github.com/pytorch/examples/tree/main/siamese_network
+    ///
+    /// Uses two identical sub-networks (shared CNN backbone) to compare pairs of images.
+    /// The network outputs a similarity score (via sigmoid) between 0 and 1.
+    /// Trained with BCELoss on MNIST image pairs.
+    /// </summary>
+    public class SiameseNetworkModel : Module<Tensor, Tensor, Tensor>
+    {
+        private Module<Tensor, Tensor> backbone;
+        private Module<Tensor, Tensor> fc;
+        private Module<Tensor, Tensor> sigmoid = Sigmoid();
+        private long fcInFeatures;
+
+        public SiameseNetworkModel(string name, torch.Device device = null) : base(name)
+        {
+            // Build a simple CNN backbone (similar to a mini ResNet for 28x28 grayscale)
+            // We use a simpler backbone since we don't have torchvision.models in TorchSharp examples
+            var backboneModules = Sequential(
+                ("conv1", Conv2d(1, 32, 3, stride: 2, padding: 1)),
+                ("bn1", BatchNorm2d(32)),
+                ("relu1", ReLU()),
+                ("conv2", Conv2d(32, 64, 3, stride: 2, padding: 1)),
+                ("bn2", BatchNorm2d(64)),
+                ("relu2", ReLU()),
+                ("conv3", Conv2d(64, 128, 3, stride: 2, padding: 1)),
+                ("bn3", BatchNorm2d(128)),
+                ("relu3", ReLU()),
+                ("avgpool", AdaptiveAvgPool2d(1))
+            );
+            backbone = backboneModules;
+            fcInFeatures = 128;   // channels after the final conv + global average pooling
+
+            fc = Sequential(
+                ("fc1", Linear(fcInFeatures * 2, 256)),
+                ("relu", ReLU(inplace: true)),
+                ("fc2", Linear(256, 1))
+            );
+
+            RegisterComponents();
+            InitWeights();
+
+            if (device != null && device.type != DeviceType.CPU)
+                this.to(device);
+        }
+
+        // Xavier-uniform for weight matrices/kernels, small constant for biases.
+        private void InitWeights()
+        {
+            foreach (var (paramName, param) in this.named_parameters()) {
+                if (paramName.Contains("weight") && param.dim() >= 2) {
+                    init.xavier_uniform_(param);
+                } else if (paramName.Contains("bias")) {
+                    init.constant_(param, 0.01);
+                }
+            }
+        }
+
+        /// <summary>
+        /// Runs one image through the shared backbone and flattens the features.
+        /// </summary>
+        private Tensor ForwardOnce(Tensor x)
+        {
+            var output = backbone.forward(x);
+            output = output.view(output.shape[0], -1);
+            return output;
+        }
+
+        /// <summary>
+        /// Returns a similarity score in (0, 1) for each pair (input1[i], input2[i]).
+        /// </summary>
+        public override Tensor forward(Tensor input1, Tensor input2)
+        {
+            var output1 = ForwardOnce(input1);
+            var output2 = ForwardOnce(input2);
+
+            // Concatenate both features
+            var combined = torch.cat(new Tensor[] { output1, output2 }, dim: 1);
+
+            var output = fc.forward(combined);
+            output = sigmoid.forward(output);
+            return output;
+        }
+    }
+}
diff --git a/src/CSharp/Models/SuperResolution.cs b/src/CSharp/Models/SuperResolution.cs
new file mode 100644
index 0000000..ad02a38
--- /dev/null
+++ b/src/CSharp/Models/SuperResolution.cs
@@ -0,0 +1,56 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Super-resolution model based on: https://github.com/pytorch/examples/tree/main/super_resolution
+    ///
+    /// Uses an efficient sub-pixel convolutional neural network (ESPCN) for super-resolution.
+    /// The model learns to upscale low-resolution images by a given factor.
+    /// </summary>
+    public class SuperResolutionModel : Module<Tensor, Tensor>
+    {
+        private Modules.Conv2d conv1;
+        private Modules.Conv2d conv2;
+        private Modules.Conv2d conv3;
+        private Modules.Conv2d conv4;
+        private Module<Tensor, Tensor> pixelShuffle;
+        private Module<Tensor, Tensor> relu = ReLU();
+
+        public SuperResolutionModel(string name, int upscaleFactor, torch.Device device = null) : base(name)
+        {
+            conv1 = Conv2d(1, 64, 5, stride: 1, padding: 2);
+            conv2 = Conv2d(64, 64, 3, stride: 1, padding: 1);
+            conv3 = Conv2d(64, 32, 3, stride: 1, padding: 1);
+            // upscaleFactor^2 output channels feed the pixel shuffle below.
+            conv4 = Conv2d(32, upscaleFactor * upscaleFactor, 3, stride: 1, padding: 1);
+            pixelShuffle = PixelShuffle(upscaleFactor);
+
+            RegisterComponents();
+            InitializeWeights();
+
+            if (device != null && device.type != DeviceType.CPU)
+                this.to(device);
+        }
+
+        // Orthogonal init with ReLU gain for the hidden convs, plain orthogonal for the output conv.
+        private void InitializeWeights()
+        {
+            init.orthogonal_(conv1.weight, init.calculate_gain(init.NonlinearityType.ReLU));
+            init.orthogonal_(conv2.weight, init.calculate_gain(init.NonlinearityType.ReLU));
+            init.orthogonal_(conv3.weight, init.calculate_gain(init.NonlinearityType.ReLU));
+            init.orthogonal_(conv4.weight);
+        }
+
+        /// <summary>
+        /// Maps a single-channel low-resolution image to the upscaled image
+        /// via three ReLU convs and a sub-pixel (pixel shuffle) rearrangement.
+        /// </summary>
+        public override Tensor forward(Tensor input)
+        {
+            var x = relu.forward(conv1.forward(input));
+            x = relu.forward(conv2.forward(x));
+            x = relu.forward(conv3.forward(x));
+            x = pixelShuffle.forward(conv4.forward(x));
+            return x;
+        }
+    }
+}
diff --git a/src/CSharp/Models/TimeSequencePrediction.cs b/src/CSharp/Models/TimeSequencePrediction.cs
new file mode 100644
index 0000000..91da1ed
--- /dev/null
+++ b/src/CSharp/Models/TimeSequencePrediction.cs
@@ -0,0 +1,85 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+using System.Collections.Generic;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Time sequence prediction model using stacked LSTMCells.
+    ///
+    /// Based on: https://github.com/pytorch/examples/tree/main/time_sequence_prediction
+    ///
+    /// Uses two stacked LSTMCells followed by a linear layer to predict
+    /// future values of a time sequence (sine waves).
+    /// </summary>
+    public class SequenceModel : Module<Tensor, int, Tensor>
+    {
+        // Width of both LSTM cell hidden states (matches the PyTorch example).
+        private const int HiddenSize = 51;
+
+        private Modules.LSTMCell lstm1;
+        private Modules.LSTMCell lstm2;
+        private Modules.Linear linear;
+
+        public SequenceModel(string name, torch.Device device = null) : base(name)
+        {
+            lstm1 = LSTMCell(1, HiddenSize);
+            lstm2 = LSTMCell(HiddenSize, HiddenSize);
+            linear = Linear(HiddenSize, 1);
+
+            RegisterComponents();
+
+            if (device != null && device.type != DeviceType.CPU)
+                this.to(device);
+        }
+
+        /// <summary>
+        /// Forward pass. Processes the input sequence step by step through two stacked LSTMCells,
+        /// then optionally predicts 'future' additional steps using its own output as input.
+        /// </summary>
+        /// <param name="input">Input tensor of shape (batch_size, sequence_length)</param>
+        /// <param name="future">Number of future steps to predict beyond the input</param>
+        /// <returns>Output tensor of shape (batch_size, sequence_length + future)</returns>
+        public override Tensor forward(Tensor input, int future)
+        {
+            var outputs = new List<Tensor>();
+            var batchSize = input.shape[0];
+
+            // Initialize hidden states and cell states to zeros (float64 to match the sine data).
+            var h_t = torch.zeros(batchSize, HiddenSize, dtype: torch.float64, device: input.device);
+            var c_t = torch.zeros(batchSize, HiddenSize, dtype: torch.float64, device: input.device);
+            var h_t2 = torch.zeros(batchSize, HiddenSize, dtype: torch.float64, device: input.device);
+            var c_t2 = torch.zeros(batchSize, HiddenSize, dtype: torch.float64, device: input.device);
+
+            // Process input sequence one time step (column) at a time.
+            var steps = input.split(1, dim: 1);
+            Tensor output = null;
+            foreach (var input_t in steps)
+            {
+                var (h1, c1) = lstm1.forward(input_t, (h_t, c_t));
+                h_t = h1;
+                c_t = c1;
+                var (h2, c2) = lstm2.forward(h_t, (h_t2, c_t2));
+                h_t2 = h2;
+                c_t2 = c2;
+                output = linear.forward(h_t2);
+                outputs.Add(output);
+            }
+
+            // Predict future steps, feeding the model's own last output back in.
+            for (int i = 0; i < future; i++)
+            {
+                var (h1, c1) = lstm1.forward(output, (h_t, c_t));
+                h_t = h1;
+                c_t = c1;
+                var (h2, c2) = lstm2.forward(h_t, (h_t2, c_t2));
+                h_t2 = h2;
+                c_t2 = c2;
+                output = linear.forward(h_t2);
+                outputs.Add(output);
+            }
+
+            return torch.cat(outputs, dim: 1);
+        }
+    }
+}
diff --git a/src/CSharp/Models/WordLanguageModel.cs b/src/CSharp/Models/WordLanguageModel.cs
new file mode 100644
index 0000000..4e257e3
--- /dev/null
+++ b/src/CSharp/Models/WordLanguageModel.cs
@@ -0,0 +1,132 @@
+// Copyright (c) .NET Foundation and Contributors. All Rights Reserved. See LICENSE in the project root for license information.
+using System;
+
+using static TorchSharp.torch;
+using static TorchSharp.torch.nn;
+
+namespace TorchSharp.Examples
+{
+    /// <summary>
+    /// Word-level language model using RNN (LSTM/GRU/RNN).
+    ///
+    /// Based on: https://github.com/pytorch/examples/tree/main/word_language_model
+    ///
+    /// Container module with an encoder (embedding), a recurrent module, and a decoder (linear).
+    /// Supports LSTM, GRU, RNN_TANH, and RNN_RELU model types.
+    /// </summary>
+    public class RNNModel : Module<Tensor, Tensor, (Tensor output, Tensor hidden)>
+    {
+        private Modules.Dropout drop;
+        private Modules.Embedding encoder;
+        private Modules.Linear decoder;
+        // Exactly one of the three recurrent fields is non-null, selected by rnn_type.
+        private Modules.GRU rnn_gru;
+        private Modules.LSTM rnn_lstm;
+        private Modules.RNN rnn_plain;
+
+        private string rnn_type;
+        private long nhid;
+        private long nlayers;
+
+        /// <summary>
+        /// Creates the model.
+        /// </summary>
+        /// <param name="rnn_type">One of: LSTM, GRU, RNN_TANH, RNN_RELU.</param>
+        /// <param name="ntoken">Vocabulary size.</param>
+        /// <param name="ninp">Embedding dimension.</param>
+        /// <param name="nhid">Hidden units per recurrent layer.</param>
+        /// <param name="nlayers">Number of stacked recurrent layers.</param>
+        /// <param name="dropout">Dropout probability applied to embeddings, RNN layers, and output.</param>
+        /// <param name="tie_weights">Share decoder weights with the encoder embedding (requires nhid == ninp).</param>
+        public RNNModel(string rnn_type, long ntoken, long ninp, long nhid, long nlayers, double dropout = 0.5, bool tie_weights = false) : base("RNNModel")
+        {
+            this.rnn_type = rnn_type;
+            this.nhid = nhid;
+            this.nlayers = nlayers;
+
+            drop = Dropout(dropout);
+            encoder = Embedding(ntoken, ninp);
+
+            switch (rnn_type)
+            {
+                case "LSTM":
+                    rnn_lstm = LSTM(ninp, nhid, numLayers: nlayers, dropout: dropout);
+                    break;
+                case "GRU":
+                    rnn_gru = GRU(ninp, nhid, numLayers: nlayers, dropout: dropout);
+                    break;
+                case "RNN_TANH":
+                    rnn_plain = RNN(ninp, nhid, numLayers: nlayers, nonLinearity: NonLinearities.Tanh, dropout: dropout);
+                    break;
+                case "RNN_RELU":
+                    rnn_plain = RNN(ninp, nhid, numLayers: nlayers, nonLinearity: NonLinearities.ReLU, dropout: dropout);
+                    break;
+                default:
+                    throw new ArgumentException($"Invalid model type: '{rnn_type}'. Options are: LSTM, GRU, RNN_TANH, RNN_RELU");
+            }
+
+            decoder = Linear(nhid, ntoken);
+
+            // Optionally tie weights
+            if (tie_weights)
+            {
+                if (nhid != ninp)
+                    throw new ArgumentException("When using the tied flag, nhid must be equal to emsize");
+                decoder.weight = encoder.weight;
+            }
+
+            InitWeights();
+            RegisterComponents();
+        }
+
+        // Uniform init for embeddings and decoder, zero decoder bias (matches the PyTorch example).
+        private void InitWeights()
+        {
+            var initrange = 0.1;
+            init.uniform_(encoder.weight, -initrange, initrange);
+            init.zeros_(decoder.bias);
+            init.uniform_(decoder.weight, -initrange, initrange);
+        }
+
+        /// <summary>
+        /// Forward pass. 'input' is (seq_len, batch) token indices; 'hidden' is the packed
+        /// hidden state from InitHidden (or a previous call). Returns flattened
+        /// log-probabilities of shape (seq_len * batch, ntoken) and the new hidden state.
+        /// </summary>
+        public override (Tensor output, Tensor hidden) forward(Tensor input, Tensor hidden)
+        {
+            var emb = drop.forward(encoder.forward(input));
+            Tensor output;
+
+            switch (rnn_type)
+            {
+                case "LSTM":
+                    // For LSTM, hidden is a concatenation of h and c along dim 0
+                    var h = hidden[TensorIndex.Slice(0, nlayers)];
+                    var c = hidden[TensorIndex.Slice(nlayers, null)];
+                    var (lstm_out, h_n, c_n) = rnn_lstm.forward(emb, (h, c));
+                    output = lstm_out;
+                    // Concatenate h and c back together
+                    hidden = torch.cat(new[] { h_n, c_n }, dim: 0);
+                    break;
+                case "GRU":
+                    var (gru_out, gru_hidden) = rnn_gru.forward(emb, hidden);
+                    output = gru_out;
+                    hidden = gru_hidden;
+                    break;
+                default:
+                    var (rnn_out, rnn_hidden) = rnn_plain.forward(emb, hidden);
+                    output = rnn_out;
+                    hidden = rnn_hidden;
+                    break;
+            }
+
+            output = drop.forward(output);
+            var decoded = decoder.forward(output);
+            decoded = decoded.view(-1, decoded.shape[decoded.dim() - 1]);
+            return (torch.nn.functional.log_softmax(decoded, dim: 1), hidden);
+        }
+
+        /// <summary>
+        /// Initialize hidden state for the RNN.
+        /// For LSTM, returns h and c concatenated along dim 0 (so a single Tensor can carry both).
+        /// For other RNN types, returns a single hidden state tensor.
+        /// </summary>
+        public Tensor InitHidden(long batchSize, torch.Device device)
+        {
+            if (rnn_type == "LSTM")
+            {
+                var h = torch.zeros(nlayers, batchSize, nhid, device: device);
+                var c = torch.zeros(nlayers, batchSize, nhid, device: device);
+                return torch.cat(new[] { h, c }, dim: 0);
+            }
+            else
+            {
+                return torch.zeros(nlayers, batchSize, nhid, device: device);
+            }
+        }
+    }
+}