feat(sdk/flutter): add dataset upload APIs and comprehensive examples

Add comprehensive dataset management to the Flutter SDK including:
- Dataset formats: JSONL, CSV, Parquet, Arrow, HuggingFace, TFRecord, WebDataset, Text, ImageFolder, Custom
- Dataset types: text completion, instruction tuning, chat, Q&A, classification, NER, vision, audio
- Upload methods: uploadDataset, uploadDatasetFromFile, createDatasetFromRecords
- Management APIs: listDatasets, getDataset, deleteDataset
- Dataset preprocessing: splitting, shuffling, deduplication, tokenization
- Complete examples showing all formats and use cases
This commit is contained in:
Gulshan Yadav 2026-01-11 16:47:47 +05:30
parent 89fc542da4
commit cb071a7a3b
4 changed files with 604 additions and 25 deletions

View file

@ -1,3 +1,4 @@
import 'dart:convert';
import 'dart:io'; import 'dart:io';
import 'package:synor_compute/synor_compute.dart'; import 'package:synor_compute/synor_compute.dart';
@ -41,6 +42,9 @@ void main() async {
// Example 8: Custom model upload // Example 8: Custom model upload
await customModelExample(client); await customModelExample(client);
// Example 9: Dataset upload formats
await datasetUploadExamples(client);
} finally { } finally {
// Always dispose client to release resources // Always dispose client to release resources
client.dispose(); client.dispose();
@ -237,16 +241,46 @@ Future<void> modelRegistryExample(SynorCompute client) async {
Future<void> trainingExample(SynorCompute client) async { Future<void> trainingExample(SynorCompute client) async {
print('=== Model Training ==='); print('=== Model Training ===');
// Example: Fine-tune Llama 3 8B on custom dataset // ========== STEP 1: Upload your dataset ==========
print('Fine-tuning llama-3-8b on custom dataset...'); print('Step 1: Uploading training dataset...\n');
// Note: In practice, you'd upload your dataset first: // Example 1: JSONL format (most common for LLM fine-tuning)
// final datasetCid = await client.uploadTensor(datasetTensor); final jsonlData = '''
{"prompt": "What is the capital of France?", "completion": "Paris"}
{"prompt": "Translate 'hello' to Spanish", "completion": "hola"}
{"prompt": "What is 2 + 2?", "completion": "4"}
{"prompt": "Who wrote Romeo and Juliet?", "completion": "William Shakespeare"}
''';
final dataset = await client.uploadDataset(
utf8.encode(jsonlData),
DatasetUploadOptions(
name: 'qa-training-data',
description: 'Question-answering training dataset',
format: DatasetFormat.jsonl,
type: DatasetType.textCompletion,
split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1, seed: 42),
preprocessing: DatasetPreprocessing(
maxLength: 2048,
shuffle: true,
deduplicate: true,
),
),
);
print('Dataset uploaded!');
print(' CID: ${dataset.cid}');
print(' Total samples: ${dataset.totalSamples}');
print(' Train/Val/Test: ${dataset.trainSamples}/${dataset.validationSamples}/${dataset.testSamples}');
print(' Schema: ${dataset.schema}');
// ========== STEP 2: Fine-tune the model ==========
print('\nStep 2: Fine-tuning llama-3-8b on dataset...\n');
final result = await client.fineTune( final result = await client.fineTune(
baseModel: 'llama-3-8b', // Use model alias baseModel: 'llama-3-8b',
datasetCid: 'QmYourDatasetCID', // Your uploaded dataset datasetCid: dataset.cid, // Use the CID from upload
outputAlias: 'my-custom-llama', // Optional: alias for trained model outputAlias: 'my-qa-model',
options: TrainingOptions( options: TrainingOptions(
framework: MlFramework.pytorch, framework: MlFramework.pytorch,
epochs: 3, epochs: 3,
@ -258,7 +292,7 @@ Future<void> trainingExample(SynorCompute client) async {
'warmup_steps': 100, 'warmup_steps': 100,
'gradient_accumulation_steps': 4, 'gradient_accumulation_steps': 4,
}, },
checkpointEvery: 500, // Save checkpoint every 500 steps checkpointEvery: 500,
processor: ProcessorType.gpu, processor: ProcessorType.gpu,
priority: Priority.high, priority: Priority.high,
), ),
@ -271,13 +305,12 @@ Future<void> trainingExample(SynorCompute client) async {
print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}'); print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}');
print(' Duration: ${training.durationMs / 1000}s'); print(' Duration: ${training.durationMs / 1000}s');
print(' Cost: \$${training.cost.toStringAsFixed(4)}'); print(' Cost: \$${training.cost.toStringAsFixed(4)}');
print(' Metrics: ${training.metrics}');
// Now use your trained model for inference // ========== STEP 3: Use your trained model ==========
print('\nUsing trained model for inference:'); print('\nStep 3: Testing trained model...\n');
final inference = await client.inference( final inference = await client.inference(
training.modelCid, // Use the CID of your trained model training.modelCid,
'Hello, how are you?', 'What is the capital of Germany?',
options: InferenceOptions(maxTokens: 50), options: InferenceOptions(maxTokens: 50),
); );
print('Response: ${inference.result}'); print('Response: ${inference.result}');
@ -286,19 +319,143 @@ Future<void> trainingExample(SynorCompute client) async {
} }
print(''); print('');
}
// Example: Streaming training progress /// Dataset upload examples - shows all supported formats
print('Training with streaming progress...'); Future<void> datasetUploadExamples(SynorCompute client) async {
await for (final progress in client.trainStream( print('=== Dataset Upload Examples ===\n');
modelCid: 'llama-3-8b',
datasetCid: 'QmYourDatasetCID', // ========== FORMAT 1: JSONL (JSON Lines) ==========
options: TrainingOptions(epochs: 1, batchSize: 16), print('Format 1: JSONL - One JSON object per line');
)) { print('''
// Update UI with progress // Text completion format
stdout.write('\r${progress.progressText} - ' {"prompt": "Hello", "completion": "Hi there!"}
'${progress.samplesPerSecond} samples/s');
// Instruction tuning format
{"instruction": "Summarize", "input": "Long text...", "output": "Summary"}
// Chat format
{"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]}
''');
// Example: Instruction tuning dataset
final instructionData = await client.createDatasetFromRecords(
name: 'instruction-dataset',
records: [
{
'instruction': 'Summarize the following text',
'input': 'The quick brown fox jumps over the lazy dog.',
'output': 'A fox jumps over a dog.'
},
{
'instruction': 'Translate to French',
'input': 'Hello world',
'output': 'Bonjour le monde'
},
],
type: DatasetType.instructionTuning,
);
print('Instruction dataset CID: ${instructionData.cid}');
// ========== FORMAT 2: CSV ==========
print('\nFormat 2: CSV - Comma-separated values with headers');
print('''
prompt,completion
"What is AI?","Artificial Intelligence is..."
"Define ML","Machine Learning is..."
''');
final csvData = '''
prompt,completion
"What is AI?","Artificial Intelligence is the simulation of human intelligence"
"Define ML","Machine Learning is a subset of AI that learns from data"
''';
final csvDataset = await client.uploadDataset(
utf8.encode(csvData),
DatasetUploadOptions(
name: 'csv-dataset',
format: DatasetFormat.csv,
type: DatasetType.textCompletion,
columnMapping: {'prompt': 'input', 'completion': 'output'},
),
);
print('CSV dataset CID: ${csvDataset.cid}');
// ========== FORMAT 3: Parquet (for large datasets) ==========
print('\nFormat 3: Parquet - Efficient columnar format for large datasets');
print(' - Best for datasets > 1GB');
print(' - Supports compression');
print(' - Fast random access');
print('''
final parquetDataset = await client.uploadDatasetFromFile(
'/path/to/dataset.parquet',
DatasetUploadOptions(
name: 'large-dataset',
format: DatasetFormat.parquet,
type: DatasetType.textCompletion,
),
);
''');
// ========== FORMAT 4: HuggingFace ==========
print('\nFormat 4: HuggingFace datasets format');
print(' - Compatible with datasets library');
print(' - Automatic schema detection');
// ========== FORMAT 5: Image folder ==========
print('\nFormat 5: Image folder structure');
print('''
dataset/
train/
cat/
img001.jpg
img002.jpg
dog/
img001.jpg
img002.jpg
val/
cat/
dog/
''');
// ========== ALL SUPPORTED FORMATS ==========
print('\nAll supported dataset formats:');
for (final format in DatasetFormat.values) {
final description = switch (format) {
DatasetFormat.jsonl => 'JSON Lines - one JSON per line (recommended for text)',
DatasetFormat.csv => 'CSV - comma-separated with headers',
DatasetFormat.parquet => 'Parquet - columnar format for large datasets',
DatasetFormat.arrow => 'Apache Arrow - in-memory format',
DatasetFormat.huggingface => 'HuggingFace datasets format',
DatasetFormat.tfrecord => 'TFRecord - TensorFlow format',
DatasetFormat.webdataset => 'WebDataset - PyTorch streaming format',
DatasetFormat.text => 'Plain text - one sample per line',
DatasetFormat.imagefolder => 'Image folder structure',
DatasetFormat.custom => 'Custom binary format',
};
print(' ${format.value.padRight(15)} - $description');
}
// ========== ALL DATASET TYPES ==========
print('\nAll supported dataset types:');
for (final type in DatasetType.values) {
final description = switch (type) {
DatasetType.textCompletion => 'prompt → completion pairs',
DatasetType.instructionTuning => 'instruction + input → output',
DatasetType.chat => 'multi-turn conversations',
DatasetType.questionAnswering => 'question → answer pairs',
DatasetType.textClassification => 'text → label',
DatasetType.ner => 'named entity recognition',
DatasetType.imageClassification => 'image → label',
DatasetType.objectDetection => 'image → bounding boxes',
DatasetType.imageSegmentation => 'image → mask',
DatasetType.imageText => 'image-text pairs (CLIP, etc.)',
DatasetType.audioTranscription => 'audio → text',
DatasetType.custom => 'custom format',
};
print(' ${type.value.padRight(22)} - $description');
} }
print('\nTraining complete!');
print(''); print('');
} }

View file

@ -501,6 +501,164 @@ class SynorCompute {
await _delete('/models/$modelId'); await _delete('/models/$modelId');
} }
// ==================== Dataset Management ====================
/// Upload a dataset for training.
///
/// Supports multiple formats: JSONL, CSV, Parquet, Arrow, HuggingFace, etc.
///
/// Example (JSONL format):
/// ```dart
/// // Create JSONL dataset
/// final jsonlData = '''
/// {"prompt": "What is 2+2?", "completion": "4"}
/// {"prompt": "Capital of France?", "completion": "Paris"}
/// {"prompt": "Hello", "completion": "Hi there!"}
/// ''';
///
/// final dataset = await client.uploadDataset(
///   utf8.encode(jsonlData),
///   DatasetUploadOptions(
///     name: 'my-qa-dataset',
///     format: DatasetFormat.jsonl,
///     type: DatasetType.textCompletion,
///     split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1),
///   ),
/// );
/// print('Dataset CID: ${dataset.cid}');
/// print('Total samples: ${dataset.totalSamples}');
/// ```
Future<DatasetUploadResult> uploadDataset(
  List<int> data,
  DatasetUploadOptions options,
) async {
  _checkDisposed();
  // Name the in-memory part "<name>.<format>" so the server can infer the
  // file type from the filename extension.
  final file = http.MultipartFile.fromBytes(
    'dataset',
    data,
    filename: '${options.name}.${options.format.value}',
  );
  return _sendDatasetUpload(file, options);
}
/// Upload a dataset from a file path.
///
/// Reads the file at [filePath] and uploads it with the same wire protocol
/// as [uploadDataset].
Future<DatasetUploadResult> uploadDatasetFromFile(
  String filePath,
  DatasetUploadOptions options,
) async {
  _checkDisposed();
  final file = await http.MultipartFile.fromPath('dataset', filePath);
  return _sendDatasetUpload(file, options);
}
/// Shared multipart POST to `/datasets/upload` used by [uploadDataset] and
/// [uploadDatasetFromFile].
///
/// Throws [SynorException] when the server responds with a non-200 status.
Future<DatasetUploadResult> _sendDatasetUpload(
  http.MultipartFile file,
  DatasetUploadOptions options,
) async {
  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    // Nested option values (split, preprocessing, column mapping) are sent
    // as JSON strings; scalar values as plain strings.
    ..fields.addAll(options.toJson().map((k, v) {
      if (v is Map || v is List) {
        return MapEntry(k, jsonEncode(v));
      }
      return MapEntry(k, v.toString());
    }))
    ..files.add(file);
  final streamedResponse = await _httpClient.send(request);
  final response = await http.Response.fromStream(streamedResponse);
  if (response.statusCode != 200) {
    throw SynorException(
      'Dataset upload failed',
      statusCode: response.statusCode,
    );
  }
  final json = jsonDecode(response.body) as Map<String, dynamic>;
  return DatasetUploadResult.fromJson(json);
}
/// List uploaded datasets, optionally filtered by [type].
Future<List<DatasetInfo>> listDatasets({DatasetType? type}) async {
  _checkDisposed();
  // Only send the filter when one was requested.
  final query = <String, String>{};
  if (type != null) {
    query['type'] = type.value;
  }
  final response = await _get('/datasets', query);
  final raw = response['datasets'] as List;
  return [
    for (final entry in raw)
      DatasetInfo.fromJson(entry as Map<String, dynamic>),
  ];
}
/// Get dataset info by ID or CID.
///
/// [datasetId] accepts either the server-assigned dataset ID or the
/// content identifier (CID) returned by [uploadDataset].
Future<DatasetInfo> getDataset(String datasetId) async {
_checkDisposed();
final response = await _get('/datasets/$datasetId');
return DatasetInfo.fromJson(response);
}
/// Delete a dataset.
///
/// Issues a DELETE against `/datasets/[datasetId]`. Runs the disposed-client
/// check first, like the other dataset APIs.
Future<void> deleteDataset(String datasetId) async {
_checkDisposed();
await _delete('/datasets/$datasetId');
}
/// Create a dataset from inline data (convenience method).
///
/// Serializes [records] to JSONL (one JSON object per line) and delegates
/// to [uploadDataset].
///
/// Example (instruction tuning):
/// ```dart
/// final dataset = await client.createDatasetFromRecords(
///   name: 'instruction-dataset',
///   records: [
///     {'instruction': 'Summarize:', 'input': 'Long text...', 'output': 'Summary'},
///     {'instruction': 'Translate:', 'input': 'Hello', 'output': 'Hola'},
///   ],
///   type: DatasetType.instructionTuning,
/// );
/// ```
Future<DatasetUploadResult> createDatasetFromRecords({
  required String name,
  required List<Map<String, dynamic>> records,
  DatasetType type = DatasetType.textCompletion,
  DatasetSplit? split,
}) async {
  // Build the JSONL payload: records separated by single newlines, no
  // trailing newline.
  final buffer = StringBuffer();
  for (var i = 0; i < records.length; i++) {
    if (i > 0) {
      buffer.write('\n');
    }
    buffer.write(jsonEncode(records[i]));
  }
  final options = DatasetUploadOptions(
    name: name,
    format: DatasetFormat.jsonl,
    type: type,
    split: split,
  );
  return uploadDataset(utf8.encode(buffer.toString()), options);
}
// ==================== Training ==================== // ==================== Training ====================
/// Train a model on a dataset. /// Train a model on a dataset.

View file

@ -595,6 +595,262 @@ class ModelUploadResult {
); );
} }
/// Dataset format for training.
enum DatasetFormat {
  /// JSON Lines format - one JSON object per line.
  jsonl('jsonl'),

  /// CSV format with headers.
  csv('csv'),

  /// Parquet columnar format (efficient for large datasets).
  parquet('parquet'),

  /// Apache Arrow format.
  arrow('arrow'),

  /// HuggingFace datasets format.
  huggingface('huggingface'),

  /// TFRecord format (TensorFlow).
  tfrecord('tfrecord'),

  /// WebDataset format (PyTorch).
  webdataset('webdataset'),

  /// Raw text files (one sample per line).
  text('text'),

  /// Image folder structure.
  imagefolder('imagefolder'),

  /// Custom binary format.
  custom('custom');

  const DatasetFormat(this.value);

  /// Wire value used by the server API.
  final String value;

  /// Parses [s] into a [DatasetFormat]; unrecognized values fall back
  /// to [jsonl].
  static DatasetFormat fromString(String s) {
    for (final candidate in DatasetFormat.values) {
      if (candidate.value == s) {
        return candidate;
      }
    }
    return jsonl;
  }
}
/// Dataset type/task.
enum DatasetType {
  /// Text completion (prompt -> completion).
  textCompletion('text_completion'),

  /// Instruction following (instruction, input, output).
  instructionTuning('instruction_tuning'),

  /// Chat/conversation format.
  chat('chat'),

  /// Question answering.
  questionAnswering('question_answering'),

  /// Text classification.
  textClassification('text_classification'),

  /// Named entity recognition.
  ner('ner'),

  /// Image classification.
  imageClassification('image_classification'),

  /// Object detection.
  objectDetection('object_detection'),

  /// Image segmentation.
  imageSegmentation('image_segmentation'),

  /// Image-text pairs.
  imageText('image_text'),

  /// Audio transcription.
  audioTranscription('audio_transcription'),

  /// Custom format.
  custom('custom');

  const DatasetType(this.value);

  /// Wire value used by the server API.
  final String value;

  /// Parses [s] into a [DatasetType]; unrecognized values fall back
  /// to [custom].
  static DatasetType fromString(String s) {
    for (final candidate in DatasetType.values) {
      if (candidate.value == s) {
        return candidate;
      }
    }
    return custom;
  }
}
/// Dataset upload options.
class DatasetUploadOptions {
  /// Dataset name.
  final String name;

  /// Optional human-readable description.
  final String? description;

  /// Dataset format.
  final DatasetFormat format;

  /// Dataset type/task.
  final DatasetType type;

  /// Column mapping (for CSV/Parquet).
  final Map<String, String>? columnMapping;

  /// Train/validation/test split ratios.
  final DatasetSplit? split;

  /// Preprocessing options.
  final DatasetPreprocessing? preprocessing;

  /// Whether the dataset is publicly visible.
  final bool isPublic;

  const DatasetUploadOptions({
    required this.name,
    this.description,
    this.format = DatasetFormat.jsonl,
    this.type = DatasetType.textCompletion,
    this.columnMapping,
    this.split,
    this.preprocessing,
    this.isPublic = false,
  });

  /// Serializes to the snake_case wire format; null optionals are omitted.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{'name': name};
    if (description != null) json['description'] = description;
    json['format'] = format.value;
    json['type'] = type.value;
    if (columnMapping != null) json['column_mapping'] = columnMapping;
    if (split != null) json['split'] = split!.toJson();
    if (preprocessing != null) json['preprocessing'] = preprocessing!.toJson();
    json['is_public'] = isPublic;
    return json;
  }
}
/// Dataset split configuration.
class DatasetSplit {
  /// Training set ratio (0.0-1.0).
  final double train;

  /// Validation set ratio (0.0-1.0).
  final double validation;

  /// Test set ratio (0.0-1.0).
  final double test;

  /// Random seed for reproducibility.
  final int? seed;

  const DatasetSplit({
    this.train = 0.8,
    this.validation = 0.1,
    this.test = 0.1,
    this.seed,
  });

  /// Serializes the split ratios; `seed` is omitted when unset.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{
      'train': train,
      'validation': validation,
      'test': test,
    };
    if (seed != null) {
      json['seed'] = seed;
    }
    return json;
  }
}
/// Dataset preprocessing options.
class DatasetPreprocessing {
  /// Maximum sequence length (for text).
  final int? maxLength;

  /// Truncation strategy.
  final String? truncation;

  /// Tokenizer to use.
  final String? tokenizer;

  /// Image size (for vision datasets).
  final List<int>? imageSize;

  /// Whether to normalize images.
  final bool? normalizeImages;

  /// Whether to shuffle the dataset.
  final bool shuffle;

  /// Whether to remove duplicate samples.
  final bool deduplicate;

  const DatasetPreprocessing({
    this.maxLength,
    this.truncation,
    this.tokenizer,
    this.imageSize,
    this.normalizeImages,
    this.shuffle = true,
    this.deduplicate = false,
  });

  /// Serializes to the snake_case wire format; null optionals are omitted.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{};
    if (maxLength != null) json['max_length'] = maxLength;
    if (truncation != null) json['truncation'] = truncation;
    if (tokenizer != null) json['tokenizer'] = tokenizer;
    if (imageSize != null) json['image_size'] = imageSize;
    if (normalizeImages != null) json['normalize_images'] = normalizeImages;
    json['shuffle'] = shuffle;
    json['deduplicate'] = deduplicate;
    return json;
  }
}
/// Dataset upload result.
class DatasetUploadResult {
  /// Dataset CID (use this for training).
  final String cid;

  /// Server-assigned dataset ID.
  final String datasetId;

  /// Total samples.
  final int totalSamples;

  /// Train samples.
  final int trainSamples;

  /// Validation samples.
  final int validationSamples;

  /// Test samples.
  final int testSamples;

  /// Size in bytes.
  final int sizeBytes;

  /// Schema detected by the server, if any.
  final Map<String, String>? schema;

  const DatasetUploadResult({
    required this.cid,
    required this.datasetId,
    required this.totalSamples,
    required this.trainSamples,
    required this.validationSamples,
    required this.testSamples,
    required this.sizeBytes,
    this.schema,
  });

  /// Builds a result from the server's JSON payload.
  ///
  /// Numeric fields are read as [num] and converted, because some JSON
  /// encoders/servers emit integral values as doubles, which a bare
  /// `as int` cast would reject with a TypeError.
  factory DatasetUploadResult.fromJson(Map<String, dynamic> json) =>
      DatasetUploadResult(
        cid: json['cid'] as String,
        datasetId: json['dataset_id'] as String,
        totalSamples: (json['total_samples'] as num).toInt(),
        trainSamples: (json['train_samples'] as num).toInt(),
        validationSamples: (json['validation_samples'] as num).toInt(),
        testSamples: (json['test_samples'] as num).toInt(),
        sizeBytes: (json['size_bytes'] as num).toInt(),
        schema:
            (json['schema'] as Map<String, dynamic>?)?.cast<String, String>(),
      );
}
/// Dataset info.
class DatasetInfo {
  /// Server-assigned dataset ID.
  final String id;

  /// Dataset name.
  final String name;

  /// Optional description.
  final String? description;

  /// Content identifier of the dataset payload.
  final String cid;

  /// Storage format.
  final DatasetFormat format;

  /// Task type.
  final DatasetType type;

  /// Total number of samples.
  final int totalSamples;

  /// Payload size in bytes.
  final int sizeBytes;

  /// Detected column/field schema, when available.
  final Map<String, String>? schema;

  /// Whether the dataset is publicly visible.
  final bool isPublic;

  /// Creation timestamp.
  final DateTime createdAt;

  const DatasetInfo({
    required this.id,
    required this.name,
    this.description,
    required this.cid,
    required this.format,
    required this.type,
    required this.totalSamples,
    required this.sizeBytes,
    this.schema,
    required this.isPublic,
    required this.createdAt,
  });

  /// Builds a [DatasetInfo] from the server's JSON payload.
  ///
  /// Numeric fields are read as [num] and converted, mirroring
  /// [DatasetUploadResult.fromJson]: a bare `as int` cast throws when a
  /// JSON encoder emits an integral value as a double.
  factory DatasetInfo.fromJson(Map<String, dynamic> json) => DatasetInfo(
        id: json['id'] as String,
        name: json['name'] as String,
        description: json['description'] as String?,
        cid: json['cid'] as String,
        format: DatasetFormat.fromString(json['format'] as String),
        type: DatasetType.fromString(json['type'] as String),
        totalSamples: (json['total_samples'] as num).toInt(),
        sizeBytes: (json['size_bytes'] as num).toInt(),
        schema:
            (json['schema'] as Map<String, dynamic>?)?.cast<String, String>(),
        isPublic: json['is_public'] as bool? ?? false,
        createdAt: DateTime.parse(json['created_at'] as String),
      );
}
/// Training progress update. /// Training progress update.
class TrainingProgress { class TrainingProgress {
final String jobId; final String jobId;

View file

@ -94,7 +94,15 @@ export 'src/types.dart'
// Training types // Training types
TrainingOptions, TrainingOptions,
TrainingResult, TrainingResult,
TrainingProgress; TrainingProgress,
// Dataset types
DatasetFormat,
DatasetType,
DatasetUploadOptions,
DatasetSplit,
DatasetPreprocessing,
DatasetUploadResult,
DatasetInfo;
export 'src/tensor.dart' show Tensor; export 'src/tensor.dart' show Tensor;