feat(sdk/flutter): add dataset upload APIs and comprehensive examples
Add comprehensive dataset management to the Flutter SDK, including:
- Dataset formats: JSONL, CSV, Parquet, Arrow, HuggingFace, TFRecord, WebDataset, Text, ImageFolder, Custom
- Dataset types: text completion, instruction tuning, chat, Q&A, classification, NER, vision, audio
- Upload methods: uploadDataset, uploadDatasetFromFile, createDatasetFromRecords
- Management APIs: listDatasets, getDataset, deleteDataset
- Dataset preprocessing: splitting, shuffling, deduplication, tokenization
- Complete examples showing all formats and use cases
This commit is contained in:
parent
89fc542da4
commit
cb071a7a3b
4 changed files with 604 additions and 25 deletions
|
|
@ -1,3 +1,4 @@
|
|||
import 'dart:convert';
|
||||
import 'dart:io';
|
||||
|
||||
import 'package:synor_compute/synor_compute.dart';
|
||||
|
|
@ -41,6 +42,9 @@ void main() async {
|
|||
|
||||
// Example 8: Custom model upload
|
||||
await customModelExample(client);
|
||||
|
||||
// Example 9: Dataset upload formats
|
||||
await datasetUploadExamples(client);
|
||||
} finally {
|
||||
// Always dispose client to release resources
|
||||
client.dispose();
|
||||
|
|
@ -237,16 +241,46 @@ Future<void> modelRegistryExample(SynorCompute client) async {
|
|||
Future<void> trainingExample(SynorCompute client) async {
|
||||
print('=== Model Training ===');
|
||||
|
||||
// Example: Fine-tune Llama 3 8B on custom dataset
|
||||
print('Fine-tuning llama-3-8b on custom dataset...');
|
||||
// ========== STEP 1: Upload your dataset ==========
|
||||
print('Step 1: Uploading training dataset...\n');
|
||||
|
||||
// Note: In practice, you'd upload your dataset first:
|
||||
// final datasetCid = await client.uploadTensor(datasetTensor);
|
||||
// Example 1: JSONL format (most common for LLM fine-tuning)
|
||||
final jsonlData = '''
|
||||
{"prompt": "What is the capital of France?", "completion": "Paris"}
|
||||
{"prompt": "Translate 'hello' to Spanish", "completion": "hola"}
|
||||
{"prompt": "What is 2 + 2?", "completion": "4"}
|
||||
{"prompt": "Who wrote Romeo and Juliet?", "completion": "William Shakespeare"}
|
||||
''';
|
||||
|
||||
final dataset = await client.uploadDataset(
|
||||
utf8.encode(jsonlData),
|
||||
DatasetUploadOptions(
|
||||
name: 'qa-training-data',
|
||||
description: 'Question-answering training dataset',
|
||||
format: DatasetFormat.jsonl,
|
||||
type: DatasetType.textCompletion,
|
||||
split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1, seed: 42),
|
||||
preprocessing: DatasetPreprocessing(
|
||||
maxLength: 2048,
|
||||
shuffle: true,
|
||||
deduplicate: true,
|
||||
),
|
||||
),
|
||||
);
|
||||
|
||||
print('Dataset uploaded!');
|
||||
print(' CID: ${dataset.cid}');
|
||||
print(' Total samples: ${dataset.totalSamples}');
|
||||
print(' Train/Val/Test: ${dataset.trainSamples}/${dataset.validationSamples}/${dataset.testSamples}');
|
||||
print(' Schema: ${dataset.schema}');
|
||||
|
||||
// ========== STEP 2: Fine-tune the model ==========
|
||||
print('\nStep 2: Fine-tuning llama-3-8b on dataset...\n');
|
||||
|
||||
final result = await client.fineTune(
|
||||
baseModel: 'llama-3-8b', // Use model alias
|
||||
datasetCid: 'QmYourDatasetCID', // Your uploaded dataset
|
||||
outputAlias: 'my-custom-llama', // Optional: alias for trained model
|
||||
baseModel: 'llama-3-8b',
|
||||
datasetCid: dataset.cid, // Use the CID from upload
|
||||
outputAlias: 'my-qa-model',
|
||||
options: TrainingOptions(
|
||||
framework: MlFramework.pytorch,
|
||||
epochs: 3,
|
||||
|
|
@ -258,7 +292,7 @@ Future<void> trainingExample(SynorCompute client) async {
|
|||
'warmup_steps': 100,
|
||||
'gradient_accumulation_steps': 4,
|
||||
},
|
||||
checkpointEvery: 500, // Save checkpoint every 500 steps
|
||||
checkpointEvery: 500,
|
||||
processor: ProcessorType.gpu,
|
||||
priority: Priority.high,
|
||||
),
|
||||
|
|
@ -271,13 +305,12 @@ Future<void> trainingExample(SynorCompute client) async {
|
|||
print(' Final loss: ${training.finalLoss.toStringAsFixed(4)}');
|
||||
print(' Duration: ${training.durationMs / 1000}s');
|
||||
print(' Cost: \$${training.cost.toStringAsFixed(4)}');
|
||||
print(' Metrics: ${training.metrics}');
|
||||
|
||||
// Now use your trained model for inference
|
||||
print('\nUsing trained model for inference:');
|
||||
// ========== STEP 3: Use your trained model ==========
|
||||
print('\nStep 3: Testing trained model...\n');
|
||||
final inference = await client.inference(
|
||||
training.modelCid, // Use the CID of your trained model
|
||||
'Hello, how are you?',
|
||||
training.modelCid,
|
||||
'What is the capital of Germany?',
|
||||
options: InferenceOptions(maxTokens: 50),
|
||||
);
|
||||
print('Response: ${inference.result}');
|
||||
|
|
@ -286,19 +319,143 @@ Future<void> trainingExample(SynorCompute client) async {
|
|||
}
|
||||
|
||||
print('');
|
||||
}
|
||||
|
||||
// Example: Streaming training progress
|
||||
print('Training with streaming progress...');
|
||||
await for (final progress in client.trainStream(
|
||||
modelCid: 'llama-3-8b',
|
||||
datasetCid: 'QmYourDatasetCID',
|
||||
options: TrainingOptions(epochs: 1, batchSize: 16),
|
||||
)) {
|
||||
// Update UI with progress
|
||||
stdout.write('\r${progress.progressText} - '
|
||||
'${progress.samplesPerSecond} samples/s');
|
||||
/// Dataset upload examples - shows all supported formats
|
||||
Future<void> datasetUploadExamples(SynorCompute client) async {
|
||||
print('=== Dataset Upload Examples ===\n');
|
||||
|
||||
// ========== FORMAT 1: JSONL (JSON Lines) ==========
|
||||
print('Format 1: JSONL - One JSON object per line');
|
||||
print('''
|
||||
// Text completion format
|
||||
{"prompt": "Hello", "completion": "Hi there!"}
|
||||
|
||||
// Instruction tuning format
|
||||
{"instruction": "Summarize", "input": "Long text...", "output": "Summary"}
|
||||
|
||||
// Chat format
|
||||
{"messages": [{"role": "user", "content": "Hi"}, {"role": "assistant", "content": "Hello!"}]}
|
||||
''');
|
||||
|
||||
// Example: Instruction tuning dataset
|
||||
final instructionData = await client.createDatasetFromRecords(
|
||||
name: 'instruction-dataset',
|
||||
records: [
|
||||
{
|
||||
'instruction': 'Summarize the following text',
|
||||
'input': 'The quick brown fox jumps over the lazy dog.',
|
||||
'output': 'A fox jumps over a dog.'
|
||||
},
|
||||
{
|
||||
'instruction': 'Translate to French',
|
||||
'input': 'Hello world',
|
||||
'output': 'Bonjour le monde'
|
||||
},
|
||||
],
|
||||
type: DatasetType.instructionTuning,
|
||||
);
|
||||
print('Instruction dataset CID: ${instructionData.cid}');
|
||||
|
||||
// ========== FORMAT 2: CSV ==========
|
||||
print('\nFormat 2: CSV - Comma-separated values with headers');
|
||||
print('''
|
||||
prompt,completion
|
||||
"What is AI?","Artificial Intelligence is..."
|
||||
"Define ML","Machine Learning is..."
|
||||
''');
|
||||
|
||||
final csvData = '''
|
||||
prompt,completion
|
||||
"What is AI?","Artificial Intelligence is the simulation of human intelligence"
|
||||
"Define ML","Machine Learning is a subset of AI that learns from data"
|
||||
''';
|
||||
|
||||
final csvDataset = await client.uploadDataset(
|
||||
utf8.encode(csvData),
|
||||
DatasetUploadOptions(
|
||||
name: 'csv-dataset',
|
||||
format: DatasetFormat.csv,
|
||||
type: DatasetType.textCompletion,
|
||||
columnMapping: {'prompt': 'input', 'completion': 'output'},
|
||||
),
|
||||
);
|
||||
print('CSV dataset CID: ${csvDataset.cid}');
|
||||
|
||||
// ========== FORMAT 3: Parquet (for large datasets) ==========
|
||||
print('\nFormat 3: Parquet - Efficient columnar format for large datasets');
|
||||
print(' - Best for datasets > 1GB');
|
||||
print(' - Supports compression');
|
||||
print(' - Fast random access');
|
||||
print('''
|
||||
final parquetDataset = await client.uploadDatasetFromFile(
|
||||
'/path/to/dataset.parquet',
|
||||
DatasetUploadOptions(
|
||||
name: 'large-dataset',
|
||||
format: DatasetFormat.parquet,
|
||||
type: DatasetType.textCompletion,
|
||||
),
|
||||
);
|
||||
''');
|
||||
|
||||
// ========== FORMAT 4: HuggingFace ==========
|
||||
print('\nFormat 4: HuggingFace datasets format');
|
||||
print(' - Compatible with datasets library');
|
||||
print(' - Automatic schema detection');
|
||||
|
||||
// ========== FORMAT 5: Image folder ==========
|
||||
print('\nFormat 5: Image folder structure');
|
||||
print('''
|
||||
dataset/
|
||||
├── train/
|
||||
│ ├── cat/
|
||||
│ │ ├── img001.jpg
|
||||
│ │ └── img002.jpg
|
||||
│ └── dog/
|
||||
│ ├── img001.jpg
|
||||
│ └── img002.jpg
|
||||
└── val/
|
||||
├── cat/
|
||||
└── dog/
|
||||
''');
|
||||
|
||||
// ========== ALL SUPPORTED FORMATS ==========
|
||||
print('\nAll supported dataset formats:');
|
||||
for (final format in DatasetFormat.values) {
|
||||
final description = switch (format) {
|
||||
DatasetFormat.jsonl => 'JSON Lines - one JSON per line (recommended for text)',
|
||||
DatasetFormat.csv => 'CSV - comma-separated with headers',
|
||||
DatasetFormat.parquet => 'Parquet - columnar format for large datasets',
|
||||
DatasetFormat.arrow => 'Apache Arrow - in-memory format',
|
||||
DatasetFormat.huggingface => 'HuggingFace datasets format',
|
||||
DatasetFormat.tfrecord => 'TFRecord - TensorFlow format',
|
||||
DatasetFormat.webdataset => 'WebDataset - PyTorch streaming format',
|
||||
DatasetFormat.text => 'Plain text - one sample per line',
|
||||
DatasetFormat.imagefolder => 'Image folder structure',
|
||||
DatasetFormat.custom => 'Custom binary format',
|
||||
};
|
||||
print(' ${format.value.padRight(15)} - $description');
|
||||
}
|
||||
|
||||
// ========== ALL DATASET TYPES ==========
|
||||
print('\nAll supported dataset types:');
|
||||
for (final type in DatasetType.values) {
|
||||
final description = switch (type) {
|
||||
DatasetType.textCompletion => 'prompt → completion pairs',
|
||||
DatasetType.instructionTuning => 'instruction + input → output',
|
||||
DatasetType.chat => 'multi-turn conversations',
|
||||
DatasetType.questionAnswering => 'question → answer pairs',
|
||||
DatasetType.textClassification => 'text → label',
|
||||
DatasetType.ner => 'named entity recognition',
|
||||
DatasetType.imageClassification => 'image → label',
|
||||
DatasetType.objectDetection => 'image → bounding boxes',
|
||||
DatasetType.imageSegmentation => 'image → mask',
|
||||
DatasetType.imageText => 'image-text pairs (CLIP, etc.)',
|
||||
DatasetType.audioTranscription => 'audio → text',
|
||||
DatasetType.custom => 'custom format',
|
||||
};
|
||||
print(' ${type.value.padRight(22)} - $description');
|
||||
}
|
||||
print('\nTraining complete!');
|
||||
|
||||
print('');
|
||||
}
|
||||
|
|
|
|||
|
|
@ -501,6 +501,164 @@ class SynorCompute {
|
|||
await _delete('/models/$modelId');
|
||||
}
|
||||
|
||||
// ==================== Dataset Management ====================

/// Uploads a dataset for training.
///
/// [data] is the raw dataset payload; [options] describes its name, format,
/// task type, optional split ratios, and preprocessing. Supports multiple
/// formats: JSONL, CSV, Parquet, Arrow, HuggingFace, etc.
///
/// Returns a [DatasetUploadResult] whose `cid` can be passed to the training
/// APIs. Throws [SynorException] when the server rejects the upload.
///
/// Example (JSONL format):
/// ```dart
/// final jsonlData = '''
/// {"prompt": "What is 2+2?", "completion": "4"}
/// {"prompt": "Capital of France?", "completion": "Paris"}
/// {"prompt": "Hello", "completion": "Hi there!"}
/// ''';
///
/// final dataset = await client.uploadDataset(
///   utf8.encode(jsonlData),
///   DatasetUploadOptions(
///     name: 'my-qa-dataset',
///     format: DatasetFormat.jsonl,
///     type: DatasetType.textCompletion,
///     split: DatasetSplit(train: 0.8, validation: 0.1, test: 0.1),
///   ),
/// );
/// print('Dataset CID: ${dataset.cid}');
/// print('Total samples: ${dataset.totalSamples}');
/// ```
Future<DatasetUploadResult> uploadDataset(
  List<int> data,
  DatasetUploadOptions options,
) async {
  _checkDisposed();

  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    // Nested structures (split, preprocessing, column_mapping) must be
    // JSON-encoded; scalar fields are sent as plain strings.
    ..fields.addAll(options.toJson().map((k, v) {
      if (v is Map || v is List) {
        return MapEntry(k, jsonEncode(v));
      }
      return MapEntry(k, v.toString());
    }))
    ..files.add(http.MultipartFile.fromBytes(
      'dataset',
      data,
      filename: '${options.name}.${options.format.value}',
    ));

  final streamedResponse = await _httpClient.send(request);
  final response = await http.Response.fromStream(streamedResponse);

  // Accept any 2xx status (an upload endpoint may legitimately answer
  // 201 Created rather than 200 OK) and surface the server's reply body
  // in the error so failures are diagnosable.
  if (response.statusCode < 200 || response.statusCode >= 300) {
    throw SynorException(
      'Dataset upload failed: ${response.body}',
      statusCode: response.statusCode,
    );
  }

  final json = jsonDecode(response.body) as Map<String, dynamic>;
  return DatasetUploadResult.fromJson(json);
}
|
||||
|
||||
/// Uploads a dataset for training from a file on disk.
///
/// [filePath] is the local path of the dataset file; [options] describes its
/// name, format, task type, optional split ratios, and preprocessing.
///
/// Returns a [DatasetUploadResult] whose `cid` can be passed to the training
/// APIs. Throws [SynorException] when the server rejects the upload.
Future<DatasetUploadResult> uploadDatasetFromFile(
  String filePath,
  DatasetUploadOptions options,
) async {
  _checkDisposed();

  final uri = Uri.parse('${_config.baseUrl}/datasets/upload');
  final request = http.MultipartRequest('POST', uri)
    ..headers.addAll(_headers)
    // Nested structures (split, preprocessing, column_mapping) must be
    // JSON-encoded; scalar fields are sent as plain strings.
    ..fields.addAll(options.toJson().map((k, v) {
      if (v is Map || v is List) {
        return MapEntry(k, jsonEncode(v));
      }
      return MapEntry(k, v.toString());
    }))
    ..files.add(await http.MultipartFile.fromPath('dataset', filePath));

  final streamedResponse = await _httpClient.send(request);
  final response = await http.Response.fromStream(streamedResponse);

  // Accept any 2xx status (uploads may answer 201 Created, not just
  // 200 OK) and surface the server's reply body in the error.
  if (response.statusCode < 200 || response.statusCode >= 300) {
    throw SynorException(
      'Dataset upload failed: ${response.body}',
      statusCode: response.statusCode,
    );
  }

  final json = jsonDecode(response.body) as Map<String, dynamic>;
  return DatasetUploadResult.fromJson(json);
}
|
||||
|
||||
/// Lists datasets previously uploaded by this account.
///
/// Pass [type] to restrict the listing to a single dataset task type;
/// when omitted, all datasets are returned.
Future<List<DatasetInfo>> listDatasets({DatasetType? type}) async {
  _checkDisposed();

  final query = <String, String>{};
  if (type != null) {
    query['type'] = type.value;
  }

  final response = await _get('/datasets', query);
  final entries = response['datasets'] as List;
  return [
    for (final entry in entries)
      DatasetInfo.fromJson(entry as Map<String, dynamic>),
  ];
}
|
||||
|
||||
/// Fetches metadata for a single dataset, addressed by ID or CID.
Future<DatasetInfo> getDataset(String datasetId) async {
  _checkDisposed();

  final body = await _get('/datasets/$datasetId');
  return DatasetInfo.fromJson(body);
}
|
||||
|
||||
/// Permanently removes the dataset addressed by [datasetId] (ID or CID).
Future<void> deleteDataset(String datasetId) async {
  _checkDisposed();

  await _delete('/datasets/$datasetId');
}
|
||||
|
||||
/// Creates a dataset from in-memory records (convenience method).
///
/// Serializes [records] to JSONL (one JSON object per line) and uploads it
/// via [uploadDataset]. Optional [description], [split], and [preprocessing]
/// are forwarded to [DatasetUploadOptions], giving this helper the same
/// surface as a direct upload while keeping existing calls source-compatible.
///
/// Throws [ArgumentError] if [records] is empty — an empty dataset is
/// almost certainly a caller bug, better caught here than server-side.
///
/// Example (instruction tuning):
/// ```dart
/// final dataset = await client.createDatasetFromRecords(
///   name: 'instruction-dataset',
///   records: [
///     {'instruction': 'Summarize:', 'input': 'Long text...', 'output': 'Summary'},
///     {'instruction': 'Translate:', 'input': 'Hello', 'output': 'Hola'},
///   ],
///   type: DatasetType.instructionTuning,
/// );
/// ```
Future<DatasetUploadResult> createDatasetFromRecords({
  required String name,
  required List<Map<String, dynamic>> records,
  DatasetType type = DatasetType.textCompletion,
  DatasetSplit? split,
  String? description,
  DatasetPreprocessing? preprocessing,
}) async {
  if (records.isEmpty) {
    throw ArgumentError.value(records, 'records', 'must not be empty');
  }

  // One encoded record per line == JSONL.
  final jsonlLines = records.map(jsonEncode).join('\n');
  final data = utf8.encode(jsonlLines);

  return uploadDataset(
    data,
    DatasetUploadOptions(
      name: name,
      description: description,
      format: DatasetFormat.jsonl,
      type: type,
      split: split,
      preprocessing: preprocessing,
    ),
  );
}
|
||||
|
||||
// ==================== Training ====================
|
||||
|
||||
/// Train a model on a dataset.
|
||||
|
|
|
|||
|
|
@ -595,6 +595,262 @@ class ModelUploadResult {
|
|||
);
|
||||
}
|
||||
|
||||
/// Serialization format of an uploaded training dataset.
enum DatasetFormat {
  /// JSON Lines — one JSON object per line.
  jsonl('jsonl'),

  /// Comma-separated values with a header row.
  csv('csv'),

  /// Parquet columnar storage; efficient for large datasets.
  parquet('parquet'),

  /// Apache Arrow in-memory format.
  arrow('arrow'),

  /// HuggingFace `datasets` library format.
  huggingface('huggingface'),

  /// TFRecord (TensorFlow).
  tfrecord('tfrecord'),

  /// WebDataset (PyTorch streaming).
  webdataset('webdataset'),

  /// Plain text, one sample per line.
  text('text'),

  /// Directory tree of images grouped into folders.
  imagefolder('imagefolder'),

  /// Application-defined binary format.
  custom('custom');

  const DatasetFormat(this.value);

  /// Wire value sent to / received from the API.
  final String value;

  /// Parses an API wire value; unknown strings fall back to [jsonl].
  static DatasetFormat fromString(String s) {
    for (final format in DatasetFormat.values) {
      if (format.value == s) return format;
    }
    return DatasetFormat.jsonl;
  }
}
|
||||
|
||||
/// Task a dataset is intended to train.
enum DatasetType {
  /// Text completion (prompt → completion).
  textCompletion('text_completion'),

  /// Instruction following (instruction, input, output).
  instructionTuning('instruction_tuning'),

  /// Chat / multi-turn conversation format.
  chat('chat'),

  /// Question answering.
  questionAnswering('question_answering'),

  /// Text classification.
  textClassification('text_classification'),

  /// Named entity recognition.
  ner('ner'),

  /// Image classification.
  imageClassification('image_classification'),

  /// Object detection.
  objectDetection('object_detection'),

  /// Image segmentation.
  imageSegmentation('image_segmentation'),

  /// Image-text pairs.
  imageText('image_text'),

  /// Audio transcription.
  audioTranscription('audio_transcription'),

  /// Custom / application-defined format.
  custom('custom');

  const DatasetType(this.value);

  /// Wire value sent to / received from the API.
  final String value;

  /// Parses an API wire value; unknown strings fall back to [custom].
  static DatasetType fromString(String s) {
    for (final type in DatasetType.values) {
      if (type.value == s) return type;
    }
    return DatasetType.custom;
  }
}
|
||||
|
||||
/// Options controlling how a dataset is uploaded and registered.
class DatasetUploadOptions {
  /// Human-readable dataset name.
  final String name;

  /// Optional free-form description.
  final String? description;

  /// Serialization format of the payload.
  final DatasetFormat format;

  /// Task the dataset is intended for.
  final DatasetType type;

  /// Maps source column names to canonical field names (CSV/Parquet).
  final Map<String, String>? columnMapping;

  /// Optional train/validation/test split ratios.
  final DatasetSplit? split;

  /// Optional preprocessing configuration.
  final DatasetPreprocessing? preprocessing;

  /// Whether the dataset is visible to other users.
  final bool isPublic;

  const DatasetUploadOptions({
    required this.name,
    this.description,
    this.format = DatasetFormat.jsonl,
    this.type = DatasetType.textCompletion,
    this.columnMapping,
    this.split,
    this.preprocessing,
    this.isPublic = false,
  });

  /// Serializes to the API's snake_case JSON shape.
  ///
  /// Null optional fields are omitted entirely rather than sent as null.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{'name': name};
    if (description != null) {
      json['description'] = description;
    }
    json['format'] = format.value;
    json['type'] = type.value;
    if (columnMapping != null) {
      json['column_mapping'] = columnMapping;
    }
    final splitConfig = split;
    if (splitConfig != null) {
      json['split'] = splitConfig.toJson();
    }
    final prep = preprocessing;
    if (prep != null) {
      json['preprocessing'] = prep.toJson();
    }
    json['is_public'] = isPublic;
    return json;
  }
}
|
||||
|
||||
/// Train/validation/test split configuration for a dataset.
class DatasetSplit {
  /// Fraction of samples assigned to the training set (0.0–1.0).
  final double train;

  /// Fraction of samples assigned to the validation set (0.0–1.0).
  final double validation;

  /// Fraction of samples assigned to the test set (0.0–1.0).
  final double test;

  /// Optional RNG seed so the split is reproducible across runs.
  final int? seed;

  const DatasetSplit({
    this.train = 0.8,
    this.validation = 0.1,
    this.test = 0.1,
    this.seed,
  });

  /// Serializes to the API's JSON shape; `seed` is omitted when null.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{
      'train': train,
      'validation': validation,
      'test': test,
    };
    if (seed != null) {
      json['seed'] = seed;
    }
    return json;
  }
}
|
||||
|
||||
/// Preprocessing applied to a dataset after upload.
class DatasetPreprocessing {
  /// Maximum sequence length for text samples.
  final int? maxLength;

  /// Truncation strategy applied when a sample exceeds [maxLength].
  final String? truncation;

  /// Name of the tokenizer to apply.
  final String? tokenizer;

  /// Target image size for vision datasets.
  final List<int>? imageSize;

  /// Whether to normalize image pixel values.
  final bool? normalizeImages;

  /// Whether to shuffle samples.
  final bool shuffle;

  /// Whether to drop duplicate samples.
  final bool deduplicate;

  const DatasetPreprocessing({
    this.maxLength,
    this.truncation,
    this.tokenizer,
    this.imageSize,
    this.normalizeImages,
    this.shuffle = true,
    this.deduplicate = false,
  });

  /// Serializes to the API's snake_case JSON shape.
  ///
  /// Null optional fields are omitted entirely rather than sent as null.
  Map<String, dynamic> toJson() {
    final json = <String, dynamic>{};
    if (maxLength != null) {
      json['max_length'] = maxLength;
    }
    if (truncation != null) {
      json['truncation'] = truncation;
    }
    if (tokenizer != null) {
      json['tokenizer'] = tokenizer;
    }
    if (imageSize != null) {
      json['image_size'] = imageSize;
    }
    if (normalizeImages != null) {
      json['normalize_images'] = normalizeImages;
    }
    json['shuffle'] = shuffle;
    json['deduplicate'] = deduplicate;
    return json;
  }
}
|
||||
|
||||
/// Result of a dataset upload.
class DatasetUploadResult {
  /// Content identifier of the stored dataset (use this for training).
  final String cid;

  /// Server-assigned dataset ID.
  final String datasetId;

  /// Total number of samples in the dataset.
  final int totalSamples;

  /// Samples assigned to the training split.
  final int trainSamples;

  /// Samples assigned to the validation split.
  final int validationSamples;

  /// Samples assigned to the test split.
  final int testSamples;

  /// Stored dataset size in bytes.
  final int sizeBytes;

  /// Detected column/field schema, if the server reported one.
  final Map<String, String>? schema;

  const DatasetUploadResult({
    required this.cid,
    required this.datasetId,
    required this.totalSamples,
    required this.trainSamples,
    required this.validationSamples,
    required this.testSamples,
    required this.sizeBytes,
    this.schema,
  });

  /// Deserializes from the API's snake_case JSON shape.
  factory DatasetUploadResult.fromJson(Map<String, dynamic> json) =>
      DatasetUploadResult(
        cid: json['cid'] as String,
        datasetId: json['dataset_id'] as String,
        // JSON decoders may surface numeric fields as `int` or `double`
        // depending on the wire representation, so cast through `num`
        // instead of straight to `int` (which throws on a double).
        totalSamples: (json['total_samples'] as num).toInt(),
        trainSamples: (json['train_samples'] as num).toInt(),
        validationSamples: (json['validation_samples'] as num).toInt(),
        testSamples: (json['test_samples'] as num).toInt(),
        sizeBytes: (json['size_bytes'] as num).toInt(),
        schema:
            (json['schema'] as Map<String, dynamic>?)?.cast<String, String>(),
      );
}
|
||||
|
||||
/// Metadata describing an uploaded dataset.
class DatasetInfo {
  /// Server-assigned dataset ID.
  final String id;

  /// Human-readable dataset name.
  final String name;

  /// Optional free-form description.
  final String? description;

  /// Content identifier of the stored dataset.
  final String cid;

  /// Serialization format of the dataset.
  final DatasetFormat format;

  /// Task the dataset is intended for.
  final DatasetType type;

  /// Total number of samples.
  final int totalSamples;

  /// Stored size in bytes.
  final int sizeBytes;

  /// Detected column/field schema, if the server reported one.
  final Map<String, String>? schema;

  /// Whether the dataset is visible to other users.
  final bool isPublic;

  /// Creation timestamp.
  final DateTime createdAt;

  const DatasetInfo({
    required this.id,
    required this.name,
    this.description,
    required this.cid,
    required this.format,
    required this.type,
    required this.totalSamples,
    required this.sizeBytes,
    this.schema,
    required this.isPublic,
    required this.createdAt,
  });

  /// Deserializes from the API's snake_case JSON shape.
  factory DatasetInfo.fromJson(Map<String, dynamic> json) => DatasetInfo(
        id: json['id'] as String,
        name: json['name'] as String,
        description: json['description'] as String?,
        cid: json['cid'] as String,
        format: DatasetFormat.fromString(json['format'] as String),
        type: DatasetType.fromString(json['type'] as String),
        // JSON decoders may surface numeric fields as `int` or `double`,
        // so cast through `num` instead of straight to `int`.
        totalSamples: (json['total_samples'] as num).toInt(),
        sizeBytes: (json['size_bytes'] as num).toInt(),
        schema:
            (json['schema'] as Map<String, dynamic>?)?.cast<String, String>(),
        isPublic: json['is_public'] as bool? ?? false,
        createdAt: DateTime.parse(json['created_at'] as String),
      );
}
|
||||
|
||||
/// Training progress update.
|
||||
class TrainingProgress {
|
||||
final String jobId;
|
||||
|
|
|
|||
|
|
@ -94,7 +94,15 @@ export 'src/types.dart'
|
|||
// Training types
|
||||
TrainingOptions,
|
||||
TrainingResult,
|
||||
TrainingProgress;
|
||||
TrainingProgress,
|
||||
// Dataset types
|
||||
DatasetFormat,
|
||||
DatasetType,
|
||||
DatasetUploadOptions,
|
||||
DatasetSplit,
|
||||
DatasetPreprocessing,
|
||||
DatasetUploadResult,
|
||||
DatasetInfo;
|
||||
|
||||
export 'src/tensor.dart' show Tensor;
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue