feat(sdk/flutter): add dataset upload APIs and comprehensive examples
Add comprehensive dataset management to the Flutter SDK, including:
- Dataset formats: JSONL, CSV, Parquet, Arrow, HuggingFace, TFRecord, WebDataset, Text, ImageFolder, Custom
- Dataset types: text completion, instruction tuning, chat, Q&A, classification, NER, vision, audio
- Upload methods: uploadDataset, uploadDatasetFromFile, createDatasetFromRecords
- Management APIs: listDatasets, getDataset, deleteDataset
- Dataset preprocessing: splitting, shuffling, deduplication, tokenization
- Complete examples showing all formats and use cases
This commit is contained in:
parent
89fc542da4
commit
cb071a7a3b
4 changed files with 604 additions and 25 deletions
|
|
@ -1,3 +1,4 @@
|
|||
import 'dart:convert';
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:synor_compute/synor_compute.dart';
|
||||
|
|
@ -41,6 +42,9 @@ void main() async {
|
|||
|
||||
// Example 8: Custom model upload
|
||||
await customModelExample(client);
|
||||
|
||||
// Example 9: Dataset upload formats
|
||||
await datasetUploadExamples(client);
|
||||
} finally {
|
||||
// Always dispose client to release resources
|
||||
client.dispose();
|
||||
|
|
@ -237,16 +241,46 @@ Future<void> modelRegistryExample(SynorCompute client) async {
|
|||
Future<void> trainingExample(SynorCompute client) async {
|
||||
print('=== Model Training ===');
|
||||
|
||||
// Example: Fine-tune Llama 3 8B on custom dataset
|
||||
print('Fine-tuning llama-3-8b on custom dataset...');
|
||||
// ========== STEP 1: Upload your dataset ==========
|
||||
print('Step 1: Uploading training dataset...\n');
|
||||
|
||||
// Note: In practice, you'd upload your dataset first:
|
||||
// final datasetCid = await client.uploadTensor(datasetTensor);
|
||||
// Example 1: JSONL format (most common for LLM fine-tuning)
|
||||
final jsonlData = '''
|
||||
{"prompt": "What is the capital of France?", "completion": "Paris"}
|
||||
{"prompt": "Translate 'hello' to Spanish", "completion": "hola"}
|
||||
{"prompt": "What is 2 + 2?", "completion": "4"}
|
||||
{"prompt": "Who wrote Romeo and Juliet?", "completion": "William Shakespeare"}
|
||||
''';
|
||||
|
||||
final dataset = await client.uploadDataset(
|
||||
utf8.encode(jsonlData),
|
||||
DatasetUploadOptions(
|
||||
name: 'qa-training-data',
|
||||
description: 'Question-answering training dataset',
|
||||
format: DatasetFormat.jsonl,
|
||||
type: DatasetType.textCompletion,
|
||||
split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1, seed: 42),
|
||||
preprocessing: DatasetPreprocessing(
|
||||
maxLength: 2048,
|
||||
shuffle: true,
|
||||
deduplicate: true,
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
print('Dataset uploaded!');
|
||||
print(' CID: ${dataset.cid}');
|
||||
print(' Total samples: ${dataset.totalSamples}');
|
||||
print(' Train/Val/Test: ${dataset.trainSamples}/${dataset.validationSamples}/${dataset.testSamples}');
|
||||
print(' Schema: ${dataset.schema}');
|
||||
|
||||
// ========== STEP 2: Fine-tune the model ==========
|
||||
print('\nStep 2: Fine-tuning llama-3-8b on dataset...\n');
|
||||
|
||||
final result = await client.fineTune(
|
||||
baseModel: 'llama-3-8b', // Use model alias
|
||||
datasetCid: 'QmYourDatasetCID', // Your uploaded dataset
|
||||
outputAlias: 'my-custom-llama', // Optional: alias for trained model
|
||||
baseModel: 'llama-3-8b',
|
||||
datasetCid: dataset.cid, // Use the CID from upload
|
||||
outputAlias: 'my-qa-model',
|
||||
options: TrainingOptions(
|
||||
framework: MlFramework.pytorch,
|
||||
epochs: 3,
|
||||
|
|
@ -258,7 +292,7 @@ Future<void> trainingExample(SynorCompute client) async {
|
|||
'warmup_steps': 100,
|
||||
'gradient_accumulation_steps': 4,
|
||||
},
|
||||
checkpointEvery: 500, // Save checkpoint every 500 steps
|
||||
checkpointEvery: 500,
|
||||
processor: ProcessorType.gpu,
|
||||
priority: Priority.high,
|
||||
),
|
||||
|
|
@ -271,13 +305,12 @@ Future<void> trainingExample(SynorCompute client) async {
|
|||
print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}');
|
||||
print(' Duration: ${training.durationMs / 1000}s');
|
||||
print(' Cost: \$${training.cost.toStringAsFixed(4)}');
|
||||
print(' Metrics: ${training.metrics}');
|
||||
|
||||
// Now use your trained model for inference
|
||||
print('\nUsing trained model for inference:');
|
||||
// ========== STEP 3: Use your trained model ==========
|
||||
print('\nStep 3: Testing trained model...\n');
|
||||
final inference = await client.inference(
|
||||
training.modelCid, // Use the CID of your trained model
|
||||
'Hello, how are you?',
|
||||
training.modelCid,
|
||||
'What is the capital of Germany?',
|
||||
options: InferenceOptions(maxTokens: 50),
|
||||
);
|
||||
print('Response: ${inference.result}');
|
||||
|
|
@ -286,19 +319,143 @@ Future<void> trainingExample(SynorCompute client) async {
|
|||
}
|
||||
|
||||
print('');
|
||||
}
|
||||
|
||||
// Example: Streaming training progress
|
||||
print('Training with streaming progress...');
|
||||
await for (final progress in client.trainStream(
|
||||
modelCid: 'llama-3-8b',
|
||||
datasetCid: 'QmYourDatasetCID',
|
||||
options: TrainingOptions(epochs: 1, batchSize: 16),
|
||||
)) {
|
||||
// Update UI with progress
|
||||
stdout.write('\r${progress.progressText} - '
|
||||
'${progress.samplesPerSecond} samples/s');
|
||||
/// Dataset upload examples - shows all supported formats
|
||||
Future<void> datasetUploadExamples(SynorCompute client) async {
|
||||
print('=== Dataset Upload Examples ===\n');
|
||||
|
||||
// ========== FORMAT 1: JSONL (JSON Lines) ==========
|
||||
print('Format 1: JSONL - One JSON object per line');
|
||||
print('''
|
||||
// Text completion format
|
||||
{"prompt": "Hello", "completion": "Hi there!"}
|
||||
|
||||
// Instruction tuning format
|
||||
{"instruction": "Summarize", "input": "Long text...", "output": "Summary"}
|
||||
|
||||
// Chat format
|
||||
{"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]}
|
||||
''');
|
||||
|
||||
// Example: Instruction tuning dataset
|
||||
final instructionData = await client.createDatasetFromRecords(
|
||||
name: 'instruction-dataset',
|
||||
records: [
|
||||
{
|
||||
'instruction': 'Summarize the following text',
|
||||
'input': 'The quick brown fox jumps over the lazy dog.',
|
||||
'output': 'A fox jumps over a dog.'
|
||||
},
|
||||
{
|
||||
'instruction': 'Translate to French',
|
||||
'input': 'Hello world',
|
||||
'output': 'Bonjour le monde'
|
||||
},
|
||||
],
|
||||
type: DatasetType.instructionTuning,
|
||||
);
|
||||
print('Instruction dataset CID: ${instructionData.cid}');
|
||||
|
||||
// ========== FORMAT 2: CSV ==========
|
||||
print('\nFormat 2: CSV - Comma-separated values with headers');
|
||||
print('''
|
||||
prompt,completion
|
||||
"What is AI?","Artificial Intelligence is..."
|
||||
"Define ML","Machine Learning is..."
|
||||
''');
|
||||
|
||||
final csvData = '''
|
||||
prompt,completion
|
||||
"What is AI?","Artificial Intelligence is the simulation of human intelligence"
|
||||
"Define ML","Machine Learning is a subset of AI that learns from data"
|
||||
''';
|
||||
|
||||
final csvDataset = await client.uploadDataset(
|
||||
utf8.encode(csvData),
|
||||
DatasetUploadOptions(
|
||||
name: 'csv-dataset',
|
||||
format: DatasetFormat.csv,
|
||||
type: DatasetType.textCompletion,
|
||||
columnMapping: {'prompt': 'input', 'completion': 'output'},
|
||||
),
|
||||
);
|
||||
print('CSV dataset CID: ${csvDataset.cid}');
|
||||
|
||||
// ========== FORMAT 3: Parquet (for large datasets) ==========
|
||||
print('\nFormat 3: Parquet - Efficient columnar format for large datasets');
|
||||
print(' - Best for datasets > 1GB');
|
||||
print(' - Supports compression');
|
||||
print(' - Fast random access');
|
||||
print('''
|
||||
final parquetDataset = await client.uploadDatasetFromFile(
|
||||
'/path/to/dataset.parquet',
|
||||
DatasetUploadOptions(
|
||||
name: 'large-dataset',
|
||||
format: DatasetFormat.parquet,
|
||||
type: DatasetType.textCompletion,
|
||||
),
|
||||
);
|
||||
''');
|
||||
|
||||
// ========== FORMAT 4: HuggingFace ==========
|
||||
print('\nFormat 4: HuggingFace datasets format');
|
||||
print(' - Compatible with datasets library');
|
||||
print(' - Automatic schema detection');
|
||||
|
||||
// ========== FORMAT 5: Image folder ==========
|
||||
print('\nFormat 5: Image folder structure');
|
||||
print('''
|
||||
dataset/
|
||||
├── train/
|
||||
│ ├── cat/
|
||||
│ │ ├── img001.jpg
|
||||
│ │ └── img002.jpg
|
||||
│ └── dog/
|
||||
│ ├── img001.jpg
|
||||
│ └── img002.jpg
|
||||
└── val/
|
||||
├── cat/
|
||||
└── dog/
|
||||
''');
|
||||
|
||||
// ========== ALL SUPPORTED FORMATS ==========
|
||||
print('\nAll supported dataset formats:');
|
||||
for (final format in DatasetFormat.values) {
|
||||
final description = switch (format) {
|
||||
DatasetFormat.jsonl => 'JSON Lines - one JSON per line (recommended for text)',
|
||||
DatasetFormat.csv => 'CSV - comma-separated with headers',
|
||||
DatasetFormat.parquet => 'Parquet - columnar format for large datasets',
|
||||
DatasetFormat.arrow => 'Apache Arrow - in-memory format',
|
||||
DatasetFormat.huggingface => 'HuggingFace datasets format',
|
||||
DatasetFormat.tfrecord => 'TFRecord - TensorFlow format',
|
||||
DatasetFormat.webdataset => 'WebDataset - PyTorch streaming format',
|
||||
DatasetFormat.text => 'Plain text - one sample per line',
|
||||
DatasetFormat.imagefolder => 'Image folder structure',
|
||||
DatasetFormat.custom => 'Custom binary format',
|
||||
};
|
||||
print(' ${format.value.padRight(15)} - $description');
|
||||
}
|
||||
|
||||
// ========== ALL DATASET TYPES ==========
|
||||
print('\nAll supported dataset types:');
|
||||
for (final type in DatasetType.values) {
|
||||
final description = switch (type) {
|
||||
DatasetType.textCompletion => 'prompt → completion pairs',
|
||||
DatasetType.instructionTuning => 'instruction + input → output',
|
||||
DatasetType.chat => 'multi-turn conversations',
|
||||
DatasetType.questionAnswering => 'question → answer pairs',
|
||||
DatasetType.textClassification => 'text → label',
|
||||
DatasetType.ner => 'named entity recognition',
|
||||
DatasetType.imageClassification => 'image → label',
|
||||
DatasetType.objectDetection => 'image → bounding boxes',
|
||||
DatasetType.imageSegmentation => 'image → mask',
|
||||
DatasetType.imageText => 'image-text pairs (CLIP, etc.)',
|
||||
DatasetType.audioTranscription => 'audio → text',
|
||||
DatasetType.custom => 'custom format',
|
||||
};
|
||||
print(' ${type.value.padRight(22)} - $description');
|
||||
}
|
||||
print('\nTraining complete!');
|
||||
|
||||
print('');
|
||||
}
|
||||
|
|
|
|||
|
|
@ -501,6 +501,164 @@ class SynorCompute {
|
|||
await _delete('/models/$modelId');
|
||||
}
|
||||
|
||||
// ==================== Dataset Management ====================

/// Uploads a dataset for training.
///
/// [data] is the raw dataset payload; [options] describes its name, format,
/// task type, optional split ratios, and preprocessing. Supports multiple
/// formats: JSONL, CSV, Parquet, Arrow, HuggingFace, etc.
///
/// Returns a [DatasetUploadResult] whose `cid` can be passed to the training
/// APIs. Throws [SynorException] when the server rejects the upload.
///
/// Example (JSONL format):
/// ```dart
/// final jsonlData = '''
/// {"prompt": "What is 2+2?", "completion": "4"}
/// {"prompt": "Capital of France?", "completion": "Paris"}
/// {"prompt": "Hello", "completion": "Hi there!"}
/// ''';
///
/// final dataset = await client.uploadDataset(
///   utf8.encode(jsonlData),
///   DatasetUploadOptions(
///     name: 'my-qa-dataset',
///     format: DatasetFormat.jsonl,
///     type: DatasetType.textCompletion,
///     split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1),
///   ),
/// );
/// print('Dataset CID: ${dataset.cid}');
/// print('Total samples: ${dataset.totalSamples}');
/// ```
Future<DatasetUploadResult> uploadDataset(
  List<int> data,
  DatasetUploadOptions options,
) async {
  _checkDisposed();

  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    // Nested structures (split, preprocessing, column_mapping) must be
    // JSON-encoded; scalar fields are sent as plain strings.
    ..fields.addAll(options.toJson().map((k, v) {
      if (v is Map || v is List) {
        return MapEntry(k, jsonEncode(v));
      }
      return MapEntry(k, v.toString());
    }))
    ..files.add(http.MultipartFile.fromBytes(
      'dataset',
      data,
      filename: '${options.name}.${options.format.value}',
    ));

  final streamedResponse = await _httpClient.send(request);
  final response = await http.Response.fromStream(streamedResponse);

  // Accept any 2xx status (an upload endpoint may legitimately answer
  // 201 Created rather than 200 OK) and surface the server's reply body
  // in the error so failures are diagnosable.
  if (response.statusCode < 200 || response.statusCode >= 300) {
    throw SynorException(
      'Dataset upload failed: ${response.body}',
      statusCode: response.statusCode,
    );
  }

  final json = jsonDecode(response.body) as Map<String, dynamic>;
  return DatasetUploadResult.fromJson(json);
}
|
||||
|
||||
/// Uploads a dataset for training from a file on disk.
///
/// [filePath] is the local path of the dataset file; [options] describes its
/// name, format, task type, optional split ratios, and preprocessing.
///
/// Returns a [DatasetUploadResult] whose `cid` can be passed to the training
/// APIs. Throws [SynorException] when the server rejects the upload.
Future<DatasetUploadResult> uploadDatasetFromFile(
  String filePath,
  DatasetUploadOptions options,
) async {
  _checkDisposed();

  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    // Nested structures (split, preprocessing, column_mapping) must be
    // JSON-encoded; scalar fields are sent as plain strings.
    ..fields.addAll(options.toJson().map((k, v) {
      if (v is Map || v is List) {
        return MapEntry(k, jsonEncode(v));
      }
      return MapEntry(k, v.toString());
    }))
    ..files.add(await http.MultipartFile.fromPath('dataset', filePath));

  final streamedResponse = await _httpClient.send(request);
  final response = await http.Response.fromStream(streamedResponse);

  // Accept any 2xx status (uploads may answer 201 Created, not just
  // 200 OK) and surface the server's reply body in the error.
  if (response.statusCode < 200 || response.statusCode >= 300) {
    throw SynorException(
      'Dataset upload failed: ${response.body}',
      statusCode: response.statusCode,
    );
  }

  final json = jsonDecode(response.body) as Map<String, dynamic>;
  return DatasetUploadResult.fromJson(json);
}
|
||||
|
||||
/// Lists datasets previously uploaded by this account.
///
/// Pass [type] to restrict the listing to a single dataset task type;
/// when omitted, all datasets are returned.
Future<List<DatasetInfo>> listDatasets({DatasetType? type}) async {
  _checkDisposed();

  final query = <String, String>{};
  if (type != null) {
    query['type'] = type.value;
  }

  final response = await _get('/datasets', query);
  final entries = response['datasets'] as List;
  return [
    for (final entry in entries)
      DatasetInfo.fromJson(entry as Map<String, dynamic>),
  ];
}
|
||||
|
||||
/// Fetches metadata for a single dataset, addressed by ID or CID.
Future<DatasetInfo> getDataset(String datasetId) async {
  _checkDisposed();

  final body = await _get('/datasets/$datasetId');
  return DatasetInfo.fromJson(body);
}
|
||||
|
||||
/// Permanently removes the dataset addressed by [datasetId] (ID or CID).
Future<void> deleteDataset(String datasetId) async {
  _checkDisposed();

  await _delete('/datasets/$datasetId');
}
|
||||
|
||||
/// Creates a dataset from in-memory records (convenience method).
///
/// Serializes [records] to JSONL (one JSON object per line) and uploads it
/// via [uploadDataset]. Optional [description], [split], and [preprocessing]
/// are forwarded to [DatasetUploadOptions], giving this helper the same
/// surface as a direct upload while keeping existing calls source-compatible.
///
/// Throws [ArgumentError] if [records] is empty — an empty dataset is
/// almost certainly a caller bug, better caught here than server-side.
///
/// Example (instruction tuning):
/// ```dart
/// final dataset = await client.createDatasetFromRecords(
///   name: 'instruction-dataset',
///   records: [
///     {'instruction': 'Summarize:', 'input': 'Long text...', 'output': 'Summary'},
///     {'instruction': 'Translate:', 'input': 'Hello', 'output': 'Hola'},
///   ],
///   type: DatasetType.instructionTuning,
/// );
/// ```
Future<DatasetUploadResult> createDatasetFromRecords({
  required String name,
  required List<Map<String, dynamic>> records,
  DatasetType type = DatasetType.textCompletion,
  DatasetSplit? split,
  String? description,
  DatasetPreprocessing? preprocessing,
}) async {
  if (records.isEmpty) {
    throw ArgumentError.value(records, 'records', 'must not be empty');
  }

  // One encoded record per line == JSONL.
  final jsonlLines = records.map(jsonEncode).join('\n');
  final data = utf8.encode(jsonlLines);

  return uploadDataset(
    data,
    DatasetUploadOptions(
      name: name,
      description: description,
      format: DatasetFormat.jsonl,
      type: type,
      split: split,
      preprocessing: preprocessing,
    ),
  );
}
|
||||
|
||||
// ==================== Training ====================
|
||||
|
||||
/// Train a model on a dataset.
|
||||
|
|
|
|||
|
|
@ -595,6 +595,262 @@ class ModelUploadResult {
|
|||
);
|
||||
}
|
||||
|
||||
/// Serialization format of an uploaded training dataset.
enum DatasetFormat {
  /// JSON Lines — one JSON object per line.
  jsonl('jsonl'),

  /// Comma-separated values with a header row.
  csv('csv'),

  /// Parquet columnar storage; efficient for large datasets.
  parquet('parquet'),

  /// Apache Arrow in-memory format.
  arrow('arrow'),

  /// HuggingFace `datasets` library format.
  huggingface('huggingface'),

  /// TFRecord (TensorFlow).
  tfrecord('tfrecord'),

  /// WebDataset (PyTorch streaming).
  webdataset('webdataset'),

  /// Plain text, one sample per line.
  text('text'),

  /// Directory tree of images grouped into folders.
  imagefolder('imagefolder'),

  /// Application-defined binary format.
  custom('custom');

  const DatasetFormat(this.value);

  /// Wire value sent to / received from the API.
  final String value;

  /// Parses an API wire value; unknown strings fall back to [jsonl].
  static DatasetFormat fromString(String s) {
    for (final format in DatasetFormat.values) {
      if (format.value == s) return format;
    }
    return DatasetFormat.jsonl;
  }
}
|
||||
|
||||
/// Task a dataset is intended to train.
enum DatasetType {
  /// Text completion (prompt → completion).
  textCompletion('text_completion'),

  /// Instruction following (instruction, input, output).
  instructionTuning('instruction_tuning'),

  /// Chat / multi-turn conversation format.
  chat('chat'),

  /// Question answering.
  questionAnswering('question_answering'),

  /// Text classification.
  textClassification('text_classification'),

  /// Named entity recognition.
  ner('ner'),

  /// Image classification.
  imageClassification('image_classification'),

  /// Object detection.
  objectDetection('object_detection'),

  /// Image segmentation.
  imageSegmentation('image_segmentation'),

  /// Image-text pairs.
  imageText('image_text'),

  /// Audio transcription.
  audioTranscription('audio_transcription'),

  /// Custom / application-defined format.
  custom('custom');

  const DatasetType(this.value);

  /// Wire value sent to / received from the API.
  final String value;

  /// Parses an API wire value; unknown strings fall back to [custom].
  static DatasetType fromString(String s) {
    for (final type in DatasetType.values) {
      if (type.value == s) return type;
    }
    return DatasetType.custom;
  }
}
|
||||
|
||||
/// Options controlling how a dataset is uploaded and registered.
class DatasetUploadOptions {
  /// Human-readable dataset name.
  final String name;

  /// Optional free-form description.
  final String? description;

  /// Serialization format of the payload.
  final DatasetFormat format;

  /// Task the dataset is intended for.
  final DatasetType type;

  /// Maps source column names to canonical field names (CSV/Parquet).
  final Map<String, String>? columnMapping;

  /// Optional train/validation/test split ratios.
  final DatasetSplit? split;

  /// Optional preprocessing configuration.
  final DatasetPreprocessing? preprocessing;

  /// Whether the dataset is visible to other users.
  final bool isPublic;

  const DatasetUploadOptions({
    required this.name,
    this.description,
    this.format = DatasetFormat.jsonl,
    this.type = DatasetType.textCompletion,
    this.columnMapping,
    this.split,
    this.preprocessing,
    this.isPublic = false,
  });

  /// Serializes to the API's snake_case JSON shape.
  ///
  /// Null optional fields are omitted entirely rather than sent as null.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{'name': name};
    if (description != null) {
      json['description'] = description;
    }
    json['format'] = format.value;
    json['type'] = type.value;
    if (columnMapping != null) {
      json['column_mapping'] = columnMapping;
    }
    final splitConfig = split;
    if (splitConfig != null) {
      json['split'] = splitConfig.toJson();
    }
    final prep = preprocessing;
    if (prep != null) {
      json['preprocessing'] = prep.toJson();
    }
    json['is_public'] = isPublic;
    return json;
  }
}
|
||||
|
||||
/// Train/validation/test split configuration for a dataset.
class DatasetSplit {
  /// Fraction of samples assigned to the training set (0.0–1.0).
  final double train;

  /// Fraction of samples assigned to the validation set (0.0–1.0).
  final double validation;

  /// Fraction of samples assigned to the test set (0.0–1.0).
  final double test;

  /// Optional RNG seed so the split is reproducible across runs.
  final int? seed;

  const DatasetSplit({
    this.train = 0.8,
    this.validation = 0.1,
    this.test = 0.1,
    this.seed,
  });

  /// Serializes to the API's JSON shape; `seed` is omitted when null.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{
      'train': train,
      'validation': validation,
      'test': test,
    };
    if (seed != null) {
      json['seed'] = seed;
    }
    return json;
  }
}
|
||||
|
||||
/// Preprocessing applied to a dataset after upload.
class DatasetPreprocessing {
  /// Maximum sequence length for text samples.
  final int? maxLength;

  /// Truncation strategy applied when a sample exceeds [maxLength].
  final String? truncation;

  /// Name of the tokenizer to apply.
  final String? tokenizer;

  /// Target image size for vision datasets.
  final List<int>? imageSize;

  /// Whether to normalize image pixel values.
  final bool? normalizeImages;

  /// Whether to shuffle samples.
  final bool shuffle;

  /// Whether to drop duplicate samples.
  final bool deduplicate;

  const DatasetPreprocessing({
    this.maxLength,
    this.truncation,
    this.tokenizer,
    this.imageSize,
    this.normalizeImages,
    this.shuffle = true,
    this.deduplicate = false,
  });

  /// Serializes to the API's snake_case JSON shape.
  ///
  /// Null optional fields are omitted entirely rather than sent as null.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{};
    if (maxLength != null) {
      json['max_length'] = maxLength;
    }
    if (truncation != null) {
      json['truncation'] = truncation;
    }
    if (tokenizer != null) {
      json['tokenizer'] = tokenizer;
    }
    if (imageSize != null) {
      json['image_size'] = imageSize;
    }
    if (normalizeImages != null) {
      json['normalize_images'] = normalizeImages;
    }
    json['shuffle'] = shuffle;
    json['deduplicate'] = deduplicate;
    return json;
  }
}
|
||||
|
||||
/// Result of a dataset upload.
class DatasetUploadResult {
  /// Content identifier of the stored dataset (use this for training).
  final String cid;

  /// Server-assigned dataset ID.
  final String datasetId;

  /// Total number of samples in the dataset.
  final int totalSamples;

  /// Samples assigned to the training split.
  final int trainSamples;

  /// Samples assigned to the validation split.
  final int validationSamples;

  /// Samples assigned to the test split.
  final int testSamples;

  /// Stored dataset size in bytes.
  final int sizeBytes;

  /// Detected column/field schema, if the server reported one.
  final Map<String, String>? schema;

  const DatasetUploadResult({
    required this.cid,
    required this.datasetId,
    required this.totalSamples,
    required this.trainSamples,
    required this.validationSamples,
    required this.testSamples,
    required this.sizeBytes,
    this.schema,
  });

  /// Deserializes from the API's snake_case JSON shape.
  factory DatasetUploadResult.fromJson(Map<String, dynamic> json) =>
      DatasetUploadResult(
        cid: json['cid'] as String,
        datasetId: json['dataset_id'] as String,
        // JSON decoders may surface numeric fields as `int` or `double`
        // depending on the wire representation, so cast through `num`
        // instead of straight to `int` (which throws on a double).
        totalSamples: (json['total_samples'] as num).toInt(),
        trainSamples: (json['train_samples'] as num).toInt(),
        validationSamples: (json['validation_samples'] as num).toInt(),
        testSamples: (json['test_samples'] as num).toInt(),
        sizeBytes: (json['size_bytes'] as num).toInt(),
        schema:
            (json['schema'] as Map<String, dynamic>?)?.cast<String, String>(),
      );
}
|
||||
|
||||
/// Metadata describing an uploaded dataset.
class DatasetInfo {
  /// Server-assigned dataset ID.
  final String id;

  /// Human-readable dataset name.
  final String name;

  /// Optional free-form description.
  final String? description;

  /// Content identifier of the stored dataset.
  final String cid;

  /// Serialization format of the dataset.
  final DatasetFormat format;

  /// Task the dataset is intended for.
  final DatasetType type;

  /// Total number of samples.
  final int totalSamples;

  /// Stored size in bytes.
  final int sizeBytes;

  /// Detected column/field schema, if the server reported one.
  final Map<String, String>? schema;

  /// Whether the dataset is visible to other users.
  final bool isPublic;

  /// Creation timestamp.
  final DateTime createdAt;

  const DatasetInfo({
    required this.id,
    required this.name,
    this.description,
    required this.cid,
    required this.format,
    required this.type,
    required this.totalSamples,
    required this.sizeBytes,
    this.schema,
    required this.isPublic,
    required this.createdAt,
  });

  /// Deserializes from the API's snake_case JSON shape.
  factory DatasetInfo.fromJson(Map<String, dynamic> json) => DatasetInfo(
        id: json['id'] as String,
        name: json['name'] as String,
        description: json['description'] as String?,
        cid: json['cid'] as String,
        format: DatasetFormat.fromString(json['format'] as String),
        type: DatasetType.fromString(json['type'] as String),
        // JSON decoders may surface numeric fields as `int` or `double`,
        // so cast through `num` instead of straight to `int`.
        totalSamples: (json['total_samples'] as num).toInt(),
        sizeBytes: (json['size_bytes'] as num).toInt(),
        schema:
            (json['schema'] as Map<String, dynamic>?)?.cast<String, String>(),
        isPublic: json['is_public'] as bool? ?? false,
        createdAt: DateTime.parse(json['created_at'] as String),
      );
}
|
||||
|
||||
/// Training progress update.
|
||||
class TrainingProgress {
|
||||
final String jobId;
|
||||
|
|
|
|||
|
|
@ -94,7 +94,15 @@ export 'src/types.dart'
|
|||
// Training types
|
||||
TrainingOptions,
|
||||
TrainingResult,
|
||||
TrainingProgress;
|
||||
TrainingProgress,
|
||||
// Dataset types
|
||||
DatasetFormat,
|
||||
DatasetType,
|
||||
DatasetUploadOptions,
|
||||
DatasetSplit,
|
||||
DatasetPreprocessing,
|
||||
DatasetUploadResult,
|
||||
DatasetInfo;
|
||||
|
||||
export 'src/tensor.dart' show Tensor;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue