diff --git a/sdk/flutter/example/example.dart b/sdk/flutter/example/example.dart index a349933..12683e7 100644 --- a/sdk/flutter/example/example.dart +++ b/sdk/flutter/example/example.dart @@ -1,3 +1,4 @@ +import 'dart:convert'; import 'dart:io'; import 'package:synor_compute/synor_compute.dart'; @@ -41,6 +42,9 @@ void main() async { // Example 8: Custom model upload await customModelExample(client); + + // Example 9: Dataset upload formats + await datasetUploadExamples(client); } finally { // Always dispose client to release resources client.dispose(); @@ -237,16 +241,46 @@ Future modelRegistryExample(SynorCompute client) async { Future trainingExample(SynorCompute client) async { print('=== Model Training ==='); - // Example: Fine-tune Llama 3 8B on custom dataset - print('Fine-tuning llama-3-8b on custom dataset...'); + // ========== STEP 1: Upload your dataset ========== + print('Step 1: Uploading training dataset...\n'); - // Note: In practice, you'd upload your dataset first: - // final datasetCid = await client.uploadTensor(datasetTensor); + // Example 1: JSONL format (most common for LLM fine-tuning) + final jsonlData = ''' +{"prompt": "What is the capital of France?", "completion": "Paris"} +{"prompt": "Translate 'hello' to Spanish", "completion": "hola"} +{"prompt": "What is 2 + 2?", "completion": "4"} +{"prompt": "Who wrote Romeo and Juliet?", "completion": "William Shakespeare"} +'''; + + final dataset = await client.uploadDataset( + utf8.encode(jsonlData), + DatasetUploadOptions( + name: 'qa-training-data', + description: 'Question-answering training dataset', + format: DatasetFormat.jsonl, + type: DatasetType.textCompletion, + split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1, seed: 42), + preprocessing: DatasetPreprocessing( + maxLength: 2048, + shuffle: true, + deduplicate: true, + ), + ), + ); + + print('Dataset uploaded!'); + print(' CID: ${dataset.cid}'); + print(' Total samples: ${dataset.totalSamples}'); + print(' Train/Val/Test: ${dataset.trainSamples}/${dataset.validationSamples}/${dataset.testSamples}'); + print(' Schema: ${dataset.schema}'); + + // ========== STEP 2: Fine-tune the model ========== + print('\nStep 2: Fine-tuning llama-3-8b on dataset...\n'); final result = await client.fineTune( - baseModel: 'llama-3-8b', // Use model alias - datasetCid: 'QmYourDatasetCID', // Your uploaded dataset - outputAlias: 'my-custom-llama', // Optional: alias for trained model + baseModel: 'llama-3-8b', + datasetCid: dataset.cid, // Use the CID from upload + outputAlias: 'my-qa-model', options: TrainingOptions( framework: MlFramework.pytorch, epochs: 3, @@ -258,7 +292,7 @@ Future trainingExample(SynorCompute client) async { 'warmup_steps': 100, 'gradient_accumulation_steps': 4, }, - checkpointEvery: 500, // Save checkpoint every 500 steps + checkpointEvery: 500, processor: ProcessorType.gpu, priority: Priority.high, ), @@ -271,13 +305,12 @@ Future trainingExample(SynorCompute client) async { print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}'); print(' Duration: ${training.durationMs / 1000}s'); print(' Cost: \$${training.cost.toStringAsFixed(4)}'); - print(' Metrics: ${training.metrics}'); - // Now use your trained model for inference - print('\nUsing trained model for inference:'); + // ========== STEP 3: Use your trained model ========== + print('\nStep 3: Testing trained model...\n'); final inference = await client.inference( - training.modelCid, // Use the CID of your trained model - 'Hello, how are you?', + training.modelCid, + 'What is the capital of Germany?', options: InferenceOptions(maxTokens: 50), ); print('Response: ${inference.result}'); @@ -286,19 +319,143 @@ Future trainingExample(SynorCompute client) async { } print(''); +} - // Example: Streaming training progress - print('Training with streaming progress...'); - await for (final progress in client.trainStream( - modelCid: 'llama-3-8b', - datasetCid: 'QmYourDatasetCID', - options: TrainingOptions(epochs: 1, batchSize: 16), - )) { - // Update UI with progress - stdout.write('\r${progress.progressText} - ' - '${progress.samplesPerSecond} samples/s'); +/// Dataset upload examples - shows all supported formats +Future datasetUploadExamples(SynorCompute client) async { + print('=== Dataset Upload Examples ===\n'); + + // ========== FORMAT 1: JSONL (JSON Lines) ========== + print('Format 1: JSONL - One JSON object per line'); + print(''' + // Text completion format + {"prompt": "Hello", "completion": "Hi there!"} + + // Instruction tuning format + {"instruction": "Summarize", "input": "Long text...", "output": "Summary"} + + // Chat format + {"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]} +'''); + + // Example: Instruction tuning dataset + final instructionData = await client.createDatasetFromRecords( + name: 'instruction-dataset', + records: [ + { + 'instruction': 'Summarize the following text', + 'input': 'The quick brown fox jumps over the lazy dog.', + 'output': 'A fox jumps over a dog.' + }, + { + 'instruction': 'Translate to French', + 'input': 'Hello world', + 'output': 'Bonjour le monde' + }, + ], + type: DatasetType.instructionTuning, + ); + print('Instruction dataset CID: ${instructionData.cid}'); + + // ========== FORMAT 2: CSV ========== + print('\nFormat 2: CSV - Comma-separated values with headers'); + print(''' + prompt,completion + "What is AI?","Artificial Intelligence is..." + "Define ML","Machine Learning is..." +'''); + + final csvData = ''' +prompt,completion +"What is AI?","Artificial Intelligence is the simulation of human intelligence" +"Define ML","Machine Learning is a subset of AI that learns from data" +'''; + + final csvDataset = await client.uploadDataset( + utf8.encode(csvData), + DatasetUploadOptions( + name: 'csv-dataset', + format: DatasetFormat.csv, + type: DatasetType.textCompletion, + columnMapping: {'prompt': 'input', 'completion': 'output'}, + ), + ); + print('CSV dataset CID: ${csvDataset.cid}'); + + // ========== FORMAT 3: Parquet (for large datasets) ========== + print('\nFormat 3: Parquet - Efficient columnar format for large datasets'); + print(' - Best for datasets > 1GB'); + print(' - Supports compression'); + print(' - Fast random access'); + print(''' + final parquetDataset = await client.uploadDatasetFromFile( + '/path/to/dataset.parquet', + DatasetUploadOptions( + name: 'large-dataset', + format: DatasetFormat.parquet, + type: DatasetType.textCompletion, + ), + ); +'''); + + // ========== FORMAT 4: HuggingFace ========== + print('\nFormat 4: HuggingFace datasets format'); + print(' - Compatible with datasets library'); + print(' - Automatic schema detection'); + + // ========== FORMAT 5: Image folder ========== + print('\nFormat 5: Image folder structure'); + print(''' + dataset/ + ├── train/ + │ ├── cat/ + │ │ ├── img001.jpg + │ │ └── img002.jpg + │ └── dog/ + │ ├── img001.jpg + │ └── img002.jpg + └── val/ + ├── cat/ + └── dog/ +'''); + + // ========== ALL SUPPORTED FORMATS ========== + print('\nAll supported dataset formats:'); + for (final format in DatasetFormat.values) { + final description = switch (format) { + DatasetFormat.jsonl => 'JSON Lines - one JSON per line (recommended for text)', + DatasetFormat.csv => 'CSV - comma-separated with headers', + DatasetFormat.parquet => 'Parquet - columnar format for large datasets', + DatasetFormat.arrow => 'Apache Arrow - in-memory format', + DatasetFormat.huggingface => 'HuggingFace datasets format', + DatasetFormat.tfrecord => 'TFRecord - TensorFlow format', + DatasetFormat.webdataset => 'WebDataset - PyTorch streaming format', + DatasetFormat.text => 'Plain text - one sample per line', + DatasetFormat.imagefolder => 'Image folder structure', + DatasetFormat.custom => 'Custom binary format', + }; + print(' ${format.value.padRight(15)} - $description'); + } + + // ========== ALL DATASET TYPES ========== + print('\nAll supported dataset types:'); + for (final type in DatasetType.values) { + final description = switch (type) { + DatasetType.textCompletion => 'prompt → completion pairs', + DatasetType.instructionTuning => 'instruction + input → output', + DatasetType.chat => 'multi-turn conversations', + DatasetType.questionAnswering => 'question → answer pairs', + DatasetType.textClassification => 'text → label', + DatasetType.ner => 'named entity recognition', + DatasetType.imageClassification => 'image → label', + DatasetType.objectDetection => 'image → bounding boxes', + DatasetType.imageSegmentation => 'image → mask', + DatasetType.imageText => 'image-text pairs (CLIP, etc.)', + DatasetType.audioTranscription => 'audio → text', + DatasetType.custom => 'custom format', + }; + print(' ${type.value.padRight(22)} - $description'); } - print('\nTraining complete!'); print(''); } diff --git a/sdk/flutter/lib/src/client.dart b/sdk/flutter/lib/src/client.dart index 1ba8563..1b3aa34 100644 --- a/sdk/flutter/lib/src/client.dart +++ b/sdk/flutter/lib/src/client.dart @@ -501,6 +501,164 @@ class SynorCompute { await _delete('/models/$modelId'); } + // ==================== Dataset Management ==================== + + /// Upload a dataset for training. + /// + /// Supports multiple formats: JSONL, CSV, Parquet, Arrow, HuggingFace, etc. + /// + /// Example (JSONL format): + /// ```dart + /// // Create JSONL dataset + /// final jsonlData = ''' + /// {"prompt": "What is 2+2?", "completion": "4"} + /// {"prompt": "Capital of France?", "completion": "Paris"} + /// {"prompt": "Hello", "completion": "Hi there!"} + /// '''; + /// + /// final dataset = await client.uploadDataset( + /// utf8.encode(jsonlData), + /// DatasetUploadOptions( + /// name: 'my-qa-dataset', + /// format: DatasetFormat.jsonl, + /// type: DatasetType.textCompletion, + /// split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1), + /// ), + /// ); + /// print('Dataset CID: ${dataset.cid}'); + /// print('Total samples: ${dataset.totalSamples}'); + /// ``` + Future uploadDataset( + List data, + DatasetUploadOptions options, + ) async { + _checkDisposed(); + + final uri = Uri.parse('${_config.baseUrl}/datasets/upload'); + final request = http.MultipartRequest('POST', uri) + ..headers.addAll(_headers) + ..fields.addAll(options.toJson().map((k, v) { + if (v is Map || v is List) { + return MapEntry(k, jsonEncode(v)); + } + return MapEntry(k, v.toString()); + })) + ..files.add(http.MultipartFile.fromBytes( + 'dataset', + data, + filename: '${options.name}.${options.format.value}', + )); + + final streamedResponse = await _httpClient.send(request); + final response = await http.Response.fromStream(streamedResponse); + + if (response.statusCode != 200) { + throw SynorException( + 'Dataset upload failed', + statusCode: response.statusCode, + ); + } + + final json = jsonDecode(response.body) as Map; + return DatasetUploadResult.fromJson(json); + } + + /// Upload a dataset from a file path. + Future uploadDatasetFromFile( + String filePath, + DatasetUploadOptions options, + ) async { + _checkDisposed(); + + final uri = Uri.parse('${_config.baseUrl}/datasets/upload'); + final request = http.MultipartRequest('POST', uri) + ..headers.addAll(_headers) + ..fields.addAll(options.toJson().map((k, v) { + if (v is Map || v is List) { + return MapEntry(k, jsonEncode(v)); + } + return MapEntry(k, v.toString()); + })) + ..files.add(await http.MultipartFile.fromPath('dataset', filePath)); + + final streamedResponse = await _httpClient.send(request); + final response = await http.Response.fromStream(streamedResponse); + + if (response.statusCode != 200) { + throw SynorException( + 'Dataset upload failed', + statusCode: response.statusCode, + ); + } + + final json = jsonDecode(response.body) as Map; + return DatasetUploadResult.fromJson(json); + } + + /// List uploaded datasets. + Future> listDatasets({DatasetType? type}) async { + _checkDisposed(); + + final params = { + if (type != null) 'type': type.value, + }; + + final response = await _get('/datasets', params); + final datasets = response['datasets'] as List; + return datasets + .map((d) => DatasetInfo.fromJson(d as Map)) + .toList(); + } + + /// Get dataset info by ID or CID. + Future getDataset(String datasetId) async { + _checkDisposed(); + + final response = await _get('/datasets/$datasetId'); + return DatasetInfo.fromJson(response); + } + + /// Delete a dataset. + Future deleteDataset(String datasetId) async { + _checkDisposed(); + + await _delete('/datasets/$datasetId'); + } + + /// Create a dataset from inline data (convenience method). + /// + /// Example (instruction tuning): + /// ```dart + /// final dataset = await client.createDatasetFromRecords( + /// name: 'instruction-dataset', + /// records: [ + /// {'instruction': 'Summarize:', 'input': 'Long text...', 'output': 'Summary'}, + /// {'instruction': 'Translate:', 'input': 'Hello', 'output': 'Hola'}, + /// ], + /// type: DatasetType.instructionTuning, + /// ); + /// ``` + Future createDatasetFromRecords({ + required String name, + required List> records, + DatasetType type = DatasetType.textCompletion, + DatasetSplit? split, + }) async { + // Convert to JSONL format + final jsonlLines = records.map((r) => jsonEncode(r)).join('\n'); + final data = utf8.encode(jsonlLines); + + return uploadDataset( + data, + DatasetUploadOptions( + name: name, + format: DatasetFormat.jsonl, + type: type, + split: split, + ), + ); + } + // ==================== Training ==================== /// Train a model on a dataset. diff --git a/sdk/flutter/lib/src/types.dart b/sdk/flutter/lib/src/types.dart index c692b91..3c98502 100644 --- a/sdk/flutter/lib/src/types.dart +++ b/sdk/flutter/lib/src/types.dart @@ -595,6 +595,262 @@ class ModelUploadResult { ); } +/// Dataset format for training. +enum DatasetFormat { + /// JSON Lines format - one JSON object per line + jsonl('jsonl'), + /// CSV format with headers + csv('csv'), + /// Parquet columnar format (efficient for large datasets) + parquet('parquet'), + /// Apache Arrow format + arrow('arrow'), + /// HuggingFace datasets format + huggingface('huggingface'), + /// TFRecord format (TensorFlow) + tfrecord('tfrecord'), + /// WebDataset format (PyTorch) + webdataset('webdataset'), + /// Raw text files (one sample per line) + text('text'), + /// Image folder structure + imagefolder('imagefolder'), + /// Custom binary format + custom('custom'); + + const DatasetFormat(this.value); + final String value; + + static DatasetFormat fromString(String s) => + DatasetFormat.values.firstWhere((f) => f.value == s, orElse: () => jsonl); +} + +/// Dataset type/task. +enum DatasetType { + /// Text completion (prompt → completion) + textCompletion('text_completion'), + /// Instruction following (instruction, input, output) + instructionTuning('instruction_tuning'), + /// Chat/conversation format + chat('chat'), + /// Question answering + questionAnswering('question_answering'), + /// Text classification + textClassification('text_classification'), + /// Named entity recognition + ner('ner'), + /// Image classification + imageClassification('image_classification'), + /// Object detection + objectDetection('object_detection'), + /// Image segmentation + imageSegmentation('image_segmentation'), + /// Image-text pairs + imageText('image_text'), + /// Audio transcription + audioTranscription('audio_transcription'), + /// Custom format + custom('custom'); + + const DatasetType(this.value); + final String value; + + static DatasetType fromString(String s) => + DatasetType.values.firstWhere((t) => t.value == s, orElse: () => custom); +} + +/// Dataset upload options. +class DatasetUploadOptions { + /// Dataset name + final String name; + /// Description + final String? description; + /// Dataset format + final DatasetFormat format; + /// Dataset type/task + final DatasetType type; + /// Column mapping (for CSV/Parquet) + final Map? columnMapping; + /// Train/validation/test split ratios + final DatasetSplit? split; + /// Preprocessing options + final DatasetPreprocessing? preprocessing; + /// Is public + final bool isPublic; + + const DatasetUploadOptions({ + required this.name, + this.description, + this.format = DatasetFormat.jsonl, + this.type = DatasetType.textCompletion, + this.columnMapping, + this.split, + this.preprocessing, + this.isPublic = false, + }); + + Map toJson() => { + 'name': name, + if (description != null) 'description': description, + 'format': format.value, + 'type': type.value, + if (columnMapping != null) 'column_mapping': columnMapping, + if (split != null) 'split': split!.toJson(), + if (preprocessing != null) 'preprocessing': preprocessing!.toJson(), + 'is_public': isPublic, + }; +} + +/// Dataset split configuration. +class DatasetSplit { + /// Training set ratio (0.0-1.0) + final double train; + /// Validation set ratio (0.0-1.0) + final double validation; + /// Test set ratio (0.0-1.0) + final double test; + /// Random seed for reproducibility + final int? seed; + + const DatasetSplit({ + this.train = 0.8, + this.validation = 0.1, + this.test = 0.1, + this.seed, + }); + + Map toJson() => { + 'train': train, + 'validation': validation, + 'test': test, + if (seed != null) 'seed': seed, + }; +} + +/// Dataset preprocessing options. +class DatasetPreprocessing { + /// Maximum sequence length (for text) + final int? maxLength; + /// Truncation strategy + final String? truncation; + /// Tokenizer to use + final String? tokenizer; + /// Image size (for vision datasets) + final List? imageSize; + /// Normalize images + final bool? normalizeImages; + /// Shuffle dataset + final bool shuffle; + /// Remove duplicates + final bool deduplicate; + + const DatasetPreprocessing({ + this.maxLength, + this.truncation, + this.tokenizer, + this.imageSize, + this.normalizeImages, + this.shuffle = true, + this.deduplicate = false, + }); + + Map toJson() => { + if (maxLength != null) 'max_length': maxLength, + if (truncation != null) 'truncation': truncation, + if (tokenizer != null) 'tokenizer': tokenizer, + if (imageSize != null) 'image_size': imageSize, + if (normalizeImages != null) 'normalize_images': normalizeImages, + 'shuffle': shuffle, + 'deduplicate': deduplicate, + }; +} + +/// Dataset upload result. +class DatasetUploadResult { + /// Dataset CID (use this for training) + final String cid; + /// Dataset ID + final String datasetId; + /// Total samples + final int totalSamples; + /// Train samples + final int trainSamples; + /// Validation samples + final int validationSamples; + /// Test samples + final int testSamples; + /// Size in bytes + final int sizeBytes; + /// Schema detected + final Map? schema; + + const DatasetUploadResult({ + required this.cid, + required this.datasetId, + required this.totalSamples, + required this.trainSamples, + required this.validationSamples, + required this.testSamples, + required this.sizeBytes, + this.schema, + }); + + factory DatasetUploadResult.fromJson(Map json) => + DatasetUploadResult( + cid: json['cid'] as String, + datasetId: json['dataset_id'] as String, + totalSamples: json['total_samples'] as int, + trainSamples: json['train_samples'] as int, + validationSamples: json['validation_samples'] as int, + testSamples: json['test_samples'] as int, + sizeBytes: json['size_bytes'] as int, + schema: (json['schema'] as Map?)?.cast(), + ); +} + +/// Dataset info. +class DatasetInfo { + final String id; + final String name; + final String? description; + final String cid; + final DatasetFormat format; + final DatasetType type; + final int totalSamples; + final int sizeBytes; + final Map? schema; + final bool isPublic; + final DateTime createdAt; + + const DatasetInfo({ + required this.id, + required this.name, + this.description, + required this.cid, + required this.format, + required this.type, + required this.totalSamples, + required this.sizeBytes, + this.schema, + required this.isPublic, + required this.createdAt, + }); + + factory DatasetInfo.fromJson(Map json) => DatasetInfo( + id: json['id'] as String, + name: json['name'] as String, + description: json['description'] as String?, + cid: json['cid'] as String, + format: DatasetFormat.fromString(json['format'] as String), + type: DatasetType.fromString(json['type'] as String), + totalSamples: json['total_samples'] as int, + sizeBytes: json['size_bytes'] as int, + schema: (json['schema'] as Map?)?.cast(), + isPublic: json['is_public'] as bool? ?? false, + createdAt: DateTime.parse(json['created_at'] as String), + ); +} + /// Training progress update. class TrainingProgress { final String jobId; diff --git a/sdk/flutter/lib/synor_compute.dart b/sdk/flutter/lib/synor_compute.dart index dafc7b3..fcb4b37 100644 --- a/sdk/flutter/lib/synor_compute.dart +++ b/sdk/flutter/lib/synor_compute.dart @@ -94,7 +94,15 @@ export 'src/types.dart' // Training types TrainingOptions, TrainingResult, - TrainingProgress; + TrainingProgress, + // Dataset types + DatasetFormat, + DatasetType, + DatasetUploadOptions, + DatasetSplit, + DatasetPreprocessing, + DatasetUploadResult, + DatasetInfo; export 'src/tensor.dart' show Tensor;