feat(sdk/flutter): add dataset upload APIs and comprehensive examples
Add comprehensive dataset management to the Flutter SDK, including:
- Dataset formats: JSONL, CSV, Parquet, Arrow, HuggingFace, TFRecord, WebDataset, Text, ImageFolder, Custom
- Dataset types: text completion, instruction tuning, chat, Q&A, classification, NER, vision, audio
- Upload methods: uploadDataset, uploadDatasetFromFile, createDatasetFromRecords
- Management APIs: listDatasets, getDataset, deleteDataset
- Dataset preprocessing: splitting, shuffling, deduplication, tokenization
- Complete examples showing all formats and use cases
This commit is contained in:
parent
89fc542da4
commit
cb071a7a3b
4 changed files with 604 additions and 25 deletions
|
|
@ -1,3 +1,4 @@
|
||||||
|
import 'dart:convert';
|
||||||
import 'dart:io';
|
import 'dart:io';
|
||||||
|
|
||||||
import 'package:synor_compute/synor_compute.dart';
|
import 'package:synor_compute/synor_compute.dart';
|
||||||
|
|
@ -41,6 +42,9 @@ void main() async {
|
||||||
|
|
||||||
// Example 8: Custom model upload
|
// Example 8: Custom model upload
|
||||||
await customModelExample(client);
|
await customModelExample(client);
|
||||||
|
|
||||||
|
// Example 9: Dataset upload formats
|
||||||
|
await datasetUploadExamples(client);
|
||||||
} finally {
|
} finally {
|
||||||
// Always dispose client to release resources
|
// Always dispose client to release resources
|
||||||
client.dispose();
|
client.dispose();
|
||||||
|
|
@ -237,16 +241,46 @@ Future<void> modelRegistryExample(SynorCompute client) async {
|
||||||
Future<void> trainingExample(SynorCompute client) async {
|
Future<void> trainingExample(SynorCompute client) async {
|
||||||
print('=== Model Training ===');
|
print('=== Model Training ===');
|
||||||
|
|
||||||
// Example: Fine-tune Llama 3 8B on custom dataset
|
// ========== STEP 1: Upload your dataset ==========
|
||||||
print('Fine-tuning llama-3-8b on custom dataset...');
|
print('Step 1: Uploading training dataset...\n');
|
||||||
|
|
||||||
// Note: In practice, you'd upload your dataset first:
|
// Example 1: JSONL format (most common for LLM fine-tuning)
|
||||||
// final datasetCid = await client.uploadTensor(datasetTensor);
|
final jsonlData = '''
|
||||||
|
{"prompt": "What is the capital of France?", "completion": "Paris"}
|
||||||
|
{"prompt": "Translate 'hello' to Spanish", "completion": "hola"}
|
||||||
|
{"prompt": "What is 2 + 2?", "completion": "4"}
|
||||||
|
{"prompt": "Who wrote Romeo and Juliet?", "completion": "William Shakespeare"}
|
||||||
|
''';
|
||||||
|
|
||||||
|
final dataset = await client.uploadDataset(
|
||||||
|
utf8.encode(jsonlData),
|
||||||
|
DatasetUploadOptions(
|
||||||
|
name: 'qa-training-data',
|
||||||
|
description: 'Question-answering training dataset',
|
||||||
|
format: DatasetFormat.jsonl,
|
||||||
|
type: DatasetType.textCompletion,
|
||||||
|
split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1, seed: 42),
|
||||||
|
preprocessing: DatasetPreprocessing(
|
||||||
|
maxLength: 2048,
|
||||||
|
shuffle: true,
|
||||||
|
deduplicate: true,
|
||||||
|
),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
print('Dataset uploaded!');
|
||||||
|
print(' CID: ${dataset.cid}');
|
||||||
|
print(' Total samples: ${dataset.totalSamples}');
|
||||||
|
print(' Train/Val/Test: ${dataset.trainSamples}/${dataset.validationSamples}/${dataset.testSamples}');
|
||||||
|
print(' Schema: ${dataset.schema}');
|
||||||
|
|
||||||
|
// ========== STEP 2: Fine-tune the model ==========
|
||||||
|
print('\nStep 2: Fine-tuning llama-3-8b on dataset...\n');
|
||||||
|
|
||||||
final result = await client.fineTune(
|
final result = await client.fineTune(
|
||||||
baseModel: 'llama-3-8b', // Use model alias
|
baseModel: 'llama-3-8b',
|
||||||
datasetCid: 'QmYourDatasetCID', // Your uploaded dataset
|
datasetCid: dataset.cid, // Use the CID from upload
|
||||||
outputAlias: 'my-custom-llama', // Optional: alias for trained model
|
outputAlias: 'my-qa-model',
|
||||||
options: TrainingOptions(
|
options: TrainingOptions(
|
||||||
framework: MlFramework.pytorch,
|
framework: MlFramework.pytorch,
|
||||||
epochs: 3,
|
epochs: 3,
|
||||||
|
|
@ -258,7 +292,7 @@ Future<void> trainingExample(SynorCompute client) async {
|
||||||
'warmup_steps': 100,
|
'warmup_steps': 100,
|
||||||
'gradient_accumulation_steps': 4,
|
'gradient_accumulation_steps': 4,
|
||||||
},
|
},
|
||||||
checkpointEvery: 500, // Save checkpoint every 500 steps
|
checkpointEvery: 500,
|
||||||
processor: ProcessorType.gpu,
|
processor: ProcessorType.gpu,
|
||||||
priority: Priority.high,
|
priority: Priority.high,
|
||||||
),
|
),
|
||||||
|
|
@ -271,13 +305,12 @@ Future<void> trainingExample(SynorCompute client) async {
|
||||||
print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}');
|
print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}');
|
||||||
print(' Duration: ${training.durationMs / 1000}s');
|
print(' Duration: ${training.durationMs / 1000}s');
|
||||||
print(' Cost: \$${training.cost.toStringAsFixed(4)}');
|
print(' Cost: \$${training.cost.toStringAsFixed(4)}');
|
||||||
print(' Metrics: ${training.metrics}');
|
|
||||||
|
|
||||||
// Now use your trained model for inference
|
// ========== STEP 3: Use your trained model ==========
|
||||||
print('\nUsing trained model for inference:');
|
print('\nStep 3: Testing trained model...\n');
|
||||||
final inference = await client.inference(
|
final inference = await client.inference(
|
||||||
training.modelCid, // Use the CID of your trained model
|
training.modelCid,
|
||||||
'Hello, how are you?',
|
'What is the capital of Germany?',
|
||||||
options: InferenceOptions(maxTokens: 50),
|
options: InferenceOptions(maxTokens: 50),
|
||||||
);
|
);
|
||||||
print('Response: ${inference.result}');
|
print('Response: ${inference.result}');
|
||||||
|
|
@ -286,19 +319,143 @@ Future<void> trainingExample(SynorCompute client) async {
|
||||||
}
|
}
|
||||||
|
|
||||||
print('');
|
print('');
|
||||||
|
|
||||||
// Example: Streaming training progress
|
|
||||||
print('Training with streaming progress...');
|
|
||||||
await for (final progress in client.trainStream(
|
|
||||||
modelCid: 'llama-3-8b',
|
|
||||||
datasetCid: 'QmYourDatasetCID',
|
|
||||||
options: TrainingOptions(epochs: 1, batchSize: 16),
|
|
||||||
)) {
|
|
||||||
// Update UI with progress
|
|
||||||
stdout.write('\r${progress.progressText} - '
|
|
||||||
'${progress.samplesPerSecond} samples/s');
|
|
||||||
}
|
}
|
||||||
print('\nTraining complete!');
|
|
||||||
|
/// Dataset upload examples - shows all supported formats
|
||||||
|
Future<void> datasetUploadExamples(SynorCompute client) async {
|
||||||
|
print('=== Dataset Upload Examples ===\n');
|
||||||
|
|
||||||
|
// ========== FORMAT 1: JSONL (JSON Lines) ==========
|
||||||
|
print('Format 1: JSONL - One JSON object per line');
|
||||||
|
print('''
|
||||||
|
// Text completion format
|
||||||
|
{"prompt": "Hello", "completion": "Hi there!"}
|
||||||
|
|
||||||
|
// Instruction tuning format
|
||||||
|
{"instruction": "Summarize", "input": "Long text...", "output": "Summary"}
|
||||||
|
|
||||||
|
// Chat format
|
||||||
|
{"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]}
|
||||||
|
''');
|
||||||
|
|
||||||
|
// Example: Instruction tuning dataset
|
||||||
|
final instructionData = await client.createDatasetFromRecords(
|
||||||
|
name: 'instruction-dataset',
|
||||||
|
records: [
|
||||||
|
{
|
||||||
|
'instruction': 'Summarize the following text',
|
||||||
|
'input': 'The quick brown fox jumps over the lazy dog.',
|
||||||
|
'output': 'A fox jumps over a dog.'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
'instruction': 'Translate to French',
|
||||||
|
'input': 'Hello world',
|
||||||
|
'output': 'Bonjour le monde'
|
||||||
|
},
|
||||||
|
],
|
||||||
|
type: DatasetType.instructionTuning,
|
||||||
|
);
|
||||||
|
print('Instruction dataset CID: ${instructionData.cid}');
|
||||||
|
|
||||||
|
// ========== FORMAT 2: CSV ==========
|
||||||
|
print('\nFormat 2: CSV - Comma-separated values with headers');
|
||||||
|
print('''
|
||||||
|
prompt,completion
|
||||||
|
"What is AI?","Artificial Intelligence is..."
|
||||||
|
"Define ML","Machine Learning is..."
|
||||||
|
''');
|
||||||
|
|
||||||
|
final csvData = '''
|
||||||
|
prompt,completion
|
||||||
|
"What is AI?","Artificial Intelligence is the simulation of human intelligence"
|
||||||
|
"Define ML","Machine Learning is a subset of AI that learns from data"
|
||||||
|
''';
|
||||||
|
|
||||||
|
final csvDataset = await client.uploadDataset(
|
||||||
|
utf8.encode(csvData),
|
||||||
|
DatasetUploadOptions(
|
||||||
|
name: 'csv-dataset',
|
||||||
|
format: DatasetFormat.csv,
|
||||||
|
type: DatasetType.textCompletion,
|
||||||
|
columnMapping: {'prompt': 'input', 'completion': 'output'},
|
||||||
|
),
|
||||||
|
);
|
||||||
|
print('CSV dataset CID: ${csvDataset.cid}');
|
||||||
|
|
||||||
|
// ========== FORMAT 3: Parquet (for large datasets) ==========
|
||||||
|
print('\nFormat 3: Parquet - Efficient columnar format for large datasets');
|
||||||
|
print(' - Best for datasets > 1GB');
|
||||||
|
print(' - Supports compression');
|
||||||
|
print(' - Fast random access');
|
||||||
|
print('''
|
||||||
|
final parquetDataset = await client.uploadDatasetFromFile(
|
||||||
|
'/path/to/dataset.parquet',
|
||||||
|
DatasetUploadOptions(
|
||||||
|
name: 'large-dataset',
|
||||||
|
format: DatasetFormat.parquet,
|
||||||
|
type: DatasetType.textCompletion,
|
||||||
|
),
|
||||||
|
);
|
||||||
|
''');
|
||||||
|
|
||||||
|
// ========== FORMAT 4: HuggingFace ==========
|
||||||
|
print('\nFormat 4: HuggingFace datasets format');
|
||||||
|
print(' - Compatible with datasets library');
|
||||||
|
print(' - Automatic schema detection');
|
||||||
|
|
||||||
|
// ========== FORMAT 5: Image folder ==========
|
||||||
|
print('\nFormat 5: Image folder structure');
|
||||||
|
print('''
|
||||||
|
dataset/
|
||||||
|
├── train/
|
||||||
|
│ ├── cat/
|
||||||
|
│ │ ├── img001.jpg
|
||||||
|
│ │ └── img002.jpg
|
||||||
|
│ └── dog/
|
||||||
|
│ ├── img001.jpg
|
||||||
|
│ └── img002.jpg
|
||||||
|
└── val/
|
||||||
|
├── cat/
|
||||||
|
└── dog/
|
||||||
|
''');
|
||||||
|
|
||||||
|
// ========== ALL SUPPORTED FORMATS ==========
|
||||||
|
print('\nAll supported dataset formats:');
|
||||||
|
for (final format in DatasetFormat.values) {
|
||||||
|
final description = switch (format) {
|
||||||
|
DatasetFormat.jsonl => 'JSON Lines - one JSON per line (recommended for text)',
|
||||||
|
DatasetFormat.csv => 'CSV - comma-separated with headers',
|
||||||
|
DatasetFormat.parquet => 'Parquet - columnar format for large datasets',
|
||||||
|
DatasetFormat.arrow => 'Apache Arrow - in-memory format',
|
||||||
|
DatasetFormat.huggingface => 'HuggingFace datasets format',
|
||||||
|
DatasetFormat.tfrecord => 'TFRecord - TensorFlow format',
|
||||||
|
DatasetFormat.webdataset => 'WebDataset - PyTorch streaming format',
|
||||||
|
DatasetFormat.text => 'Plain text - one sample per line',
|
||||||
|
DatasetFormat.imagefolder => 'Image folder structure',
|
||||||
|
DatasetFormat.custom => 'Custom binary format',
|
||||||
|
};
|
||||||
|
print(' ${format.value.padRight(15)} - $description');
|
||||||
|
}
|
||||||
|
|
||||||
|
// ========== ALL DATASET TYPES ==========
|
||||||
|
print('\nAll supported dataset types:');
|
||||||
|
for (final type in DatasetType.values) {
|
||||||
|
final description = switch (type) {
|
||||||
|
DatasetType.textCompletion => 'prompt → completion pairs',
|
||||||
|
DatasetType.instructionTuning => 'instruction + input → output',
|
||||||
|
DatasetType.chat => 'multi-turn conversations',
|
||||||
|
DatasetType.questionAnswering => 'question → answer pairs',
|
||||||
|
DatasetType.textClassification => 'text → label',
|
||||||
|
DatasetType.ner => 'named entity recognition',
|
||||||
|
DatasetType.imageClassification => 'image → label',
|
||||||
|
DatasetType.objectDetection => 'image → bounding boxes',
|
||||||
|
DatasetType.imageSegmentation => 'image → mask',
|
||||||
|
DatasetType.imageText => 'image-text pairs (CLIP, etc.)',
|
||||||
|
DatasetType.audioTranscription => 'audio → text',
|
||||||
|
DatasetType.custom => 'custom format',
|
||||||
|
};
|
||||||
|
print(' ${type.value.padRight(22)} - $description');
|
||||||
|
}
|
||||||
|
|
||||||
print('');
|
print('');
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -501,6 +501,164 @@ class SynorCompute {
|
||||||
await _delete('/models/$modelId');
|
await _delete('/models/$modelId');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// ==================== Dataset Management ====================
|
||||||
|
|
||||||
|
/// Uploads an in-memory dataset for training.
///
/// Supports multiple formats: JSONL, CSV, Parquet, Arrow, HuggingFace, etc.
///
/// [data] is sent as the multipart file part `dataset`, with the file name
/// `<options.name>.<options.format.value>`. Every entry of
/// `options.toJson()` is sent as a form field: map and list values are
/// JSON-encoded, all other values are converted with `toString()`.
///
/// Returns the [DatasetUploadResult] decoded from the JSON response body.
///
/// Throws a [SynorException] carrying the HTTP status code when the server
/// answers with a non-2xx status.
///
/// Example (JSONL format):
/// ```dart
/// // Create JSONL dataset
/// final jsonlData = '''
/// {"prompt": "What is 2+2?", "completion": "4"}
/// {"prompt": "Capital of France?", "completion": "Paris"}
/// {"prompt": "Hello", "completion": "Hi there!"}
/// ''';
///
/// final dataset = await client.uploadDataset(
///   utf8.encode(jsonlData),
///   DatasetUploadOptions(
///     name: 'my-qa-dataset',
///     format: DatasetFormat.jsonl,
///     type: DatasetType.textCompletion,
///     split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1),
///   ),
/// );
/// print('Dataset CID: ${dataset.cid}');
/// print('Total samples: ${dataset.totalSamples}');
/// ```
Future<DatasetUploadResult> uploadDataset(
  List<int> data,
  DatasetUploadOptions options,
) async {
  _checkDisposed();

  // Multipart form fields must be strings, so nested option values
  // (maps and lists) are serialized as JSON text.
  final fields = <String, String>{
    for (final entry in options.toJson().entries)
      entry.key: entry.value is Map || entry.value is List
          ? jsonEncode(entry.value)
          : entry.value.toString(),
  };

  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    ..fields.addAll(fields)
    ..files.add(http.MultipartFile.fromBytes(
      'dataset',
      data,
      filename: '${options.name}.${options.format.value}',
    ));

  final response =
      await http.Response.fromStream(await _httpClient.send(request));

  // Any 2xx status is a success; a resource-creating POST may legitimately
  // answer 201 Created rather than 200 OK.
  if (response.statusCode < 200 || response.statusCode >= 300) {
    throw SynorException(
      'Dataset upload failed',
      statusCode: response.statusCode,
    );
  }

  return DatasetUploadResult.fromJson(
    jsonDecode(response.body) as Map<String, dynamic>,
  );
}
|
||||||
|
|
||||||
|
/// Uploads a dataset read from the file at [filePath].
///
/// The file is attached as the multipart file part `dataset`. Every entry
/// of `options.toJson()` is sent as a form field: map and list values are
/// JSON-encoded, all other values are converted with `toString()`.
///
/// Returns the [DatasetUploadResult] decoded from the JSON response body.
///
/// Throws a [SynorException] carrying the HTTP status code when the server
/// answers with a non-2xx status.
Future<DatasetUploadResult> uploadDatasetFromFile(
  String filePath,
  DatasetUploadOptions options,
) async {
  _checkDisposed();

  // Multipart form fields must be strings, so nested option values
  // (maps and lists) are serialized as JSON text.
  final fields = <String, String>{
    for (final entry in options.toJson().entries)
      entry.key: entry.value is Map || entry.value is List
          ? jsonEncode(entry.value)
          : entry.value.toString(),
  };

  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    ..fields.addAll(fields)
    ..files.add(await http.MultipartFile.fromPath('dataset', filePath));

  final response =
      await http.Response.fromStream(await _httpClient.send(request));

  // Any 2xx status is a success; a resource-creating POST may legitimately
  // answer 201 Created rather than 200 OK.
  if (response.statusCode < 200 || response.statusCode >= 300) {
    throw SynorException(
      'Dataset upload failed',
      statusCode: response.statusCode,
    );
  }

  return DatasetUploadResult.fromJson(
    jsonDecode(response.body) as Map<String, dynamic>,
  );
}
|
||||||
|
|
||||||
|
/// Lists uploaded datasets.
///
/// When [type] is given, its wire value is sent as the `type` query
/// parameter. The result is built from the `datasets` array of the
/// response.
Future<List<DatasetInfo>> listDatasets({DatasetType? type}) async {
  _checkDisposed();

  final query = <String, String>{};
  if (type != null) {
    query['type'] = type.value;
  }

  final payload = await _get('/datasets', query);
  return [
    for (final entry in payload['datasets'] as List)
      DatasetInfo.fromJson(entry as Map<String, dynamic>),
  ];
}
|
||||||
|
|
||||||
|
/// Fetches dataset info by ID or CID.
///
/// Requests `/datasets/<datasetId>` and decodes the response into a
/// [DatasetInfo].
Future<DatasetInfo> getDataset(String datasetId) async {
  _checkDisposed();

  final path = '/datasets/$datasetId';
  return DatasetInfo.fromJson(await _get(path));
}
|
||||||
|
|
||||||
|
/// Deletes the dataset identified by [datasetId].
///
/// Issues a delete request for `/datasets/<datasetId>`.
Future<void> deleteDataset(String datasetId) async {
  _checkDisposed();

  final path = '/datasets/$datasetId';
  await _delete(path);
}
|
||||||
|
|
||||||
|
/// Creates a dataset from inline records (convenience method).
///
/// Each entry of [records] is JSON-encoded onto its own line, the lines are
/// joined with `\n`, and the UTF-8 bytes are passed to [uploadDataset] with
/// [DatasetFormat.jsonl].
///
/// [name], [type], [split], [description], [preprocessing] and [isPublic]
/// are forwarded unchanged to [DatasetUploadOptions]; the optional ones
/// default to the same values that class uses.
///
/// Example (instruction tuning):
/// ```dart
/// final dataset = await client.createDatasetFromRecords(
///   name: 'instruction-dataset',
///   records: [
///     {'instruction': 'Summarize:', 'input': 'Long text...', 'output': 'Summary'},
///     {'instruction': 'Translate:', 'input': 'Hello', 'output': 'Hola'},
///   ],
///   type: DatasetType.instructionTuning,
/// );
/// ```
Future<DatasetUploadResult> createDatasetFromRecords({
  required String name,
  required List<Map<String, dynamic>> records,
  DatasetType type = DatasetType.textCompletion,
  DatasetSplit? split,
  String? description,
  DatasetPreprocessing? preprocessing,
  bool isPublic = false,
}) async {
  // Convert to JSONL: one JSON object per line.
  final jsonl = records.map(jsonEncode).join('\n');

  return uploadDataset(
    utf8.encode(jsonl),
    DatasetUploadOptions(
      name: name,
      description: description,
      format: DatasetFormat.jsonl,
      type: type,
      split: split,
      preprocessing: preprocessing,
      isPublic: isPublic,
    ),
  );
}
|
||||||
|
|
||||||
// ==================== Training ====================
|
// ==================== Training ====================
|
||||||
|
|
||||||
/// Train a model on a dataset.
|
/// Train a model on a dataset.
|
||||||
|
|
|
||||||
|
|
@ -595,6 +595,262 @@ class ModelUploadResult {
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// On-disk formats a training dataset can be uploaded in.
enum DatasetFormat {
  /// JSON Lines: one JSON object per line.
  jsonl('jsonl'),

  /// CSV with a header row.
  csv('csv'),

  /// Parquet columnar format (efficient for large datasets).
  parquet('parquet'),

  /// Apache Arrow format.
  arrow('arrow'),

  /// HuggingFace datasets format.
  huggingface('huggingface'),

  /// TFRecord format (TensorFlow).
  tfrecord('tfrecord'),

  /// WebDataset format (PyTorch).
  webdataset('webdataset'),

  /// Raw text files (one sample per line).
  text('text'),

  /// Image folder structure.
  imagefolder('imagefolder'),

  /// Custom binary format.
  custom('custom');

  const DatasetFormat(this.value);

  /// String identifier of this format.
  final String value;

  /// Returns the format whose [value] equals [s].
  ///
  /// Falls back to [jsonl] when no format matches.
  static DatasetFormat fromString(String s) {
    for (final candidate in DatasetFormat.values) {
      if (candidate.value == s) {
        return candidate;
      }
    }
    return jsonl;
  }
}
|
||||||
|
|
||||||
|
/// Task a dataset is meant for.
enum DatasetType {
  /// Text completion (prompt → completion).
  textCompletion('text_completion'),

  /// Instruction following (instruction, input, output).
  instructionTuning('instruction_tuning'),

  /// Chat/conversation format.
  chat('chat'),

  /// Question answering.
  questionAnswering('question_answering'),

  /// Text classification.
  textClassification('text_classification'),

  /// Named entity recognition.
  ner('ner'),

  /// Image classification.
  imageClassification('image_classification'),

  /// Object detection.
  objectDetection('object_detection'),

  /// Image segmentation.
  imageSegmentation('image_segmentation'),

  /// Image-text pairs.
  imageText('image_text'),

  /// Audio transcription.
  audioTranscription('audio_transcription'),

  /// Custom format.
  custom('custom');

  const DatasetType(this.value);

  /// String identifier of this type.
  final String value;

  /// Returns the type whose [value] equals [s].
  ///
  /// Falls back to [custom] when no type matches.
  static DatasetType fromString(String s) {
    for (final candidate in DatasetType.values) {
      if (candidate.value == s) {
        return candidate;
      }
    }
    return custom;
  }
}
|
||||||
|
|
||||||
|
/// Options describing a dataset being uploaded.
class DatasetUploadOptions {
  /// Dataset name.
  final String name;

  /// Optional description.
  final String? description;

  /// Dataset format; defaults to [DatasetFormat.jsonl].
  final DatasetFormat format;

  /// Dataset type/task; defaults to [DatasetType.textCompletion].
  final DatasetType type;

  /// Column mapping (for CSV/Parquet).
  final Map<String, String>? columnMapping;

  /// Train/validation/test split ratios.
  final DatasetSplit? split;

  /// Preprocessing options.
  final DatasetPreprocessing? preprocessing;

  /// Whether the dataset is public; defaults to `false`.
  final bool isPublic;

  const DatasetUploadOptions({
    required this.name,
    this.description,
    this.format = DatasetFormat.jsonl,
    this.type = DatasetType.textCompletion,
    this.columnMapping,
    this.split,
    this.preprocessing,
    this.isPublic = false,
  });

  /// Serializes these options to a JSON-compatible map.
  ///
  /// Optional members that are `null` are omitted. Keys are inserted in a
  /// fixed order: name, description, format, type, column_mapping, split,
  /// preprocessing, is_public.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{'name': name};

    final descriptionText = description;
    if (descriptionText != null) {
      json['description'] = descriptionText;
    }

    json['format'] = format.value;
    json['type'] = type.value;

    final mapping = columnMapping;
    if (mapping != null) {
      json['column_mapping'] = mapping;
    }

    final splitConfig = split;
    if (splitConfig != null) {
      json['split'] = splitConfig.toJson();
    }

    final preprocessingConfig = preprocessing;
    if (preprocessingConfig != null) {
      json['preprocessing'] = preprocessingConfig.toJson();
    }

    json['is_public'] = isPublic;
    return json;
  }
}
|
||||||
|
|
||||||
|
/// Train/validation/test split configuration for a dataset.
///
/// Each ratio is documented as a fraction in the range 0.0-1.0; in debug
/// builds the constructor asserts that range so that values such as
/// percentages (`80`) are caught before they reach the server.
class DatasetSplit {
  /// Training set ratio (0.0-1.0).
  final double train;

  /// Validation set ratio (0.0-1.0).
  final double validation;

  /// Test set ratio (0.0-1.0).
  final double test;

  /// Random seed for reproducibility.
  final int? seed;

  const DatasetSplit({
    this.train = 0.8,
    this.validation = 0.1,
    this.test = 0.1,
    this.seed,
  })  : assert(
          train >= 0.0 && train <= 1.0,
          'train must be a ratio between 0.0 and 1.0',
        ),
        assert(
          validation >= 0.0 && validation <= 1.0,
          'validation must be a ratio between 0.0 and 1.0',
        ),
        assert(
          test >= 0.0 && test <= 1.0,
          'test must be a ratio between 0.0 and 1.0',
        );

  /// Serializes the split to a JSON-compatible map.
  ///
  /// `seed` is omitted when it is `null`.
  Map<String, dynamic> toJson() => {
        'train': train,
        'validation': validation,
        'test': test,
        if (seed != null) 'seed': seed,
      };
}
|
||||||
|
|
||||||
|
/// Preprocessing options sent along with a dataset upload.
class DatasetPreprocessing {
  /// Maximum sequence length (for text).
  final int? maxLength;

  /// Truncation strategy.
  final String? truncation;

  /// Tokenizer to use.
  final String? tokenizer;

  /// Image size (for vision datasets).
  final List<int>? imageSize;

  /// Whether to normalize images.
  final bool? normalizeImages;

  /// Whether to shuffle the dataset; defaults to `true`.
  final bool shuffle;

  /// Whether to remove duplicates; defaults to `false`.
  final bool deduplicate;

  const DatasetPreprocessing({
    this.maxLength,
    this.truncation,
    this.tokenizer,
    this.imageSize,
    this.normalizeImages,
    this.shuffle = true,
    this.deduplicate = false,
  });

  /// Serializes these options to a JSON-compatible map.
  ///
  /// Nullable members are omitted when `null`; `shuffle` and `deduplicate`
  /// are always present.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{};

    void putIfSet(String key, Object? value) {
      if (value != null) {
        json[key] = value;
      }
    }

    putIfSet('max_length', maxLength);
    putIfSet('truncation', truncation);
    putIfSet('tokenizer', tokenizer);
    putIfSet('image_size', imageSize);
    putIfSet('normalize_images', normalizeImages);
    json['shuffle'] = shuffle;
    json['deduplicate'] = deduplicate;
    return json;
  }
}
|
||||||
|
|
||||||
|
/// Result returned after a dataset upload.
class DatasetUploadResult {
  /// Dataset CID (use this for training).
  final String cid;

  /// Dataset ID.
  final String datasetId;

  /// Total number of samples.
  final int totalSamples;

  /// Number of training samples.
  final int trainSamples;

  /// Number of validation samples.
  final int validationSamples;

  /// Number of test samples.
  final int testSamples;

  /// Size in bytes.
  final int sizeBytes;

  /// Detected schema, if the response contained one.
  final Map<String, String>? schema;

  const DatasetUploadResult({
    required this.cid,
    required this.datasetId,
    required this.totalSamples,
    required this.trainSamples,
    required this.validationSamples,
    required this.testSamples,
    required this.sizeBytes,
    this.schema,
  });

  /// Builds a result from the decoded JSON response [json].
  ///
  /// Numeric fields accept any JSON number, so an integral value encoded as
  /// `10.0` is read as `10` instead of failing an `int` cast. Schema values
  /// are converted to strings eagerly, so a non-string value cannot surface
  /// later as a cast error when the map is read or printed.
  ///
  /// Throws a [TypeError] when a required field is missing or has the wrong
  /// type.
  factory DatasetUploadResult.fromJson(Map<String, dynamic> json) =>
      DatasetUploadResult(
        cid: json['cid'] as String,
        datasetId: json['dataset_id'] as String,
        totalSamples: _readInt(json['total_samples']),
        trainSamples: _readInt(json['train_samples']),
        validationSamples: _readInt(json['validation_samples']),
        testSamples: _readInt(json['test_samples']),
        sizeBytes: _readInt(json['size_bytes']),
        schema: _readSchema(json['schema']),
      );

  /// Reads a required JSON number as an `int`.
  static int _readInt(Object? raw) => (raw as num).toInt();

  /// Copies an optional JSON object into a `Map<String, String>`.
  static Map<String, String>? _readSchema(Object? raw) {
    if (raw == null) return null;
    return {
      for (final entry in (raw as Map<String, dynamic>).entries)
        entry.key: '${entry.value}',
    };
  }
}
|
||||||
|
|
||||||
|
/// Metadata describing an uploaded dataset.
class DatasetInfo {
  /// Dataset identifier (`id` in the JSON payload).
  final String id;

  /// Dataset name.
  final String name;

  /// Optional description.
  final String? description;

  /// Dataset CID.
  final String cid;

  /// Dataset format.
  final DatasetFormat format;

  /// Dataset type/task.
  final DatasetType type;

  /// Total number of samples.
  final int totalSamples;

  /// Size in bytes.
  final int sizeBytes;

  /// Dataset schema, if the payload contained one.
  final Map<String, String>? schema;

  /// Whether the dataset is public.
  final bool isPublic;

  /// Creation timestamp, parsed from `created_at`.
  final DateTime createdAt;

  const DatasetInfo({
    required this.id,
    required this.name,
    this.description,
    required this.cid,
    required this.format,
    required this.type,
    required this.totalSamples,
    required this.sizeBytes,
    this.schema,
    required this.isPublic,
    required this.createdAt,
  });

  /// Builds dataset info from the decoded JSON payload [json].
  ///
  /// Numeric fields accept any JSON number, so an integral value encoded as
  /// `10.0` is read as `10` instead of failing an `int` cast. Schema values
  /// are converted to strings eagerly, so a non-string value cannot surface
  /// later as a cast error when the map is read or printed. `is_public`
  /// defaults to `false` when absent.
  ///
  /// Throws a [TypeError] when a required field is missing or has the wrong
  /// type, and a [FormatException] when `created_at` is not a parseable
  /// date.
  factory DatasetInfo.fromJson(Map<String, dynamic> json) => DatasetInfo(
        id: json['id'] as String,
        name: json['name'] as String,
        description: json['description'] as String?,
        cid: json['cid'] as String,
        format: DatasetFormat.fromString(json['format'] as String),
        type: DatasetType.fromString(json['type'] as String),
        totalSamples: _readInt(json['total_samples']),
        sizeBytes: _readInt(json['size_bytes']),
        schema: _readSchema(json['schema']),
        isPublic: json['is_public'] as bool? ?? false,
        createdAt: DateTime.parse(json['created_at'] as String),
      );

  /// Reads a required JSON number as an `int`.
  static int _readInt(Object? raw) => (raw as num).toInt();

  /// Copies an optional JSON object into a `Map<String, String>`.
  static Map<String, String>? _readSchema(Object? raw) {
    if (raw == null) return null;
    return {
      for (final entry in (raw as Map<String, dynamic>).entries)
        entry.key: '${entry.value}',
    };
  }
}
|
||||||
|
|
||||||
/// Training progress update.
|
/// Training progress update.
|
||||||
class TrainingProgress {
|
class TrainingProgress {
|
||||||
final String jobId;
|
final String jobId;
|
||||||
|
|
|
||||||
|
|
@ -94,7 +94,15 @@ export 'src/types.dart'
|
||||||
// Training types
|
// Training types
|
||||||
TrainingOptions,
|
TrainingOptions,
|
||||||
TrainingResult,
|
TrainingResult,
|
||||||
TrainingProgress;
|
TrainingProgress,
|
||||||
|
// Dataset types
|
||||||
|
DatasetFormat,
|
||||||
|
DatasetType,
|
||||||
|
DatasetUploadOptions,
|
||||||
|
DatasetSplit,
|
||||||
|
DatasetPreprocessing,
|
||||||
|
DatasetUploadResult,
|
||||||
|
DatasetInfo;
|
||||||
|
|
||||||
export 'src/tensor.dart' show Tensor;
|
export 'src/tensor.dart' show Tensor;
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue