Principle Chapter II: Multipart Upload of Large Files (Resumable Upload)
When users upload files to BOS via a browser, large files must first be split into parts before uploading. During the upload process, issues such as page closure, browser crashes, or network interruptions may occur, resulting in upload failure. BOS supports multipart upload and resumable upload features. For details about multipart upload, please refer to “Multipart Upload of Objects.” Here, we will introduce the implementation method of “resumable upload.”
Implementation Principle
Using multipart file upload (multipartUpload), BOS assigns an uploadId to the upload process. The file is then divided into several parts, each uploaded independently. Once all parts are uploaded, the BOS service generates an eTag for each part. When all parts are uploaded, the BOS service uses these eTags and the uploadId to locate the correct parts and combine them back into the original file.
In this method, BOS does not require all parts to be uploaded at once; they can be uploaded incrementally. This means that if the page is accidentally closed during the upload process, there's no need to restart the upload from the beginning—you simply re-upload the parts that were not successfully uploaded. To make this work, you must persist the uploadId of the upload task; the eTags of the parts that were already uploaded do not need to be stored locally, because they can be recovered by querying the listParts API. Before uploading a part, check whether it has already been uploaded; if so, skip it.
The uploadId must be stored in a way that is unaffected by page closure. A good approach is to save it in localStorage.
Local Storage
When saving the uploadId, you must assign a key to distinguish between different files and upload tasks. In this example, the key is formed by combining the file name, file size, part size, bucket name, and object name.
/**
 * Build the localStorage key that identifies one upload task.
 * The key combines file name, file size, part size, bucket and object
 * name, joined by '&'. (As noted below, this is not collision-proof:
 * two different files with the same name and size produce the same key.)
 */
var generateLocalKey = function (blob, chunkSize, bucket, object) {
    var fields = [blob.name, blob.size, chunkSize, bucket, object];
    return fields.join('&');
};
Note: This key generation method is not entirely accurate. If you select two files with the same file name and size but different contents in two separate upload tasks, this method cannot distinguish between them. A more precise approach is to calculate the MD5 value of both the file name and content and use it as the key.
We choose localStorage as the storage method:
// Thin wrappers around window.localStorage for persisting the uploadId
// of an unfinished multipart upload. localStorage survives page reloads
// and browser restarts, which is what makes resuming possible.

// Fetch the stored uploadId for a task key (null if none).
var getUploadId = function (key) {
    return localStorage.getItem(key);
};

// Remember the uploadId for a task key.
var setUploadId = function (key, uploadId) {
    return localStorage.setItem(key, uploadId);
};

// Forget the uploadId once the upload has completed.
var removeUploadId = function (key) {
    return localStorage.removeItem(key);
};
Initialize multipart upload
When initializing multipart upload, there are two possibilities:
- If the uploadId of this file already exists, skip the initiateMultipartUpload() method and call listParts() to obtain the uploaded part information;
- If no uploadId exists for the file, use the initiateMultipartUpload() method to obtain a new uploadId and then save it to localStorage.
// ... Omit the BosClient initialization process
// var bosClient = new BosClient(bosConfig);

/**
 * Start a new multipart upload, or resume an unfinished one.
 *
 * Resolves with a response whose body always carries both `uploadId`
 * and `parts` (the list of parts already uploaded — empty for a fresh
 * upload), so downstream code can treat both cases uniformly.
 */
var initiateMultipartUpload = function (file, chunkSize, bucket, object) {
    // Generate the key for localStorage based on the file
    var key = generateLocalKey(file, chunkSize, bucket, object);
    // Get the corresponding `uploadId`
    var uploadId = getUploadId(key);
    if (uploadId) {
        // If `uploadId` exists, there is an unfinished multipart upload:
        // ask BOS which parts already arrived.
        // NOTE: listParts is an instance method — it must be called on the
        // `bosClient` instance, not on the `BosClient` constructor.
        return bosClient.listParts(bucket, object, uploadId)
            .then(function (response) {
                // response.body.parts contains information about uploaded parts
                response.body.uploadId = uploadId;
                return response;
            })
            .catch(function (err) {
                // The stored uploadId may have expired or been aborted on the
                // server; drop it and fall back to starting a fresh upload.
                removeUploadId(key);
                return initiateMultipartUpload(file, chunkSize, bucket, object);
            });
    }
    // No stored uploadId: initialize using the normal process.
    return bosClient.initiateMultipartUpload(bucket, object)
        .then(function (response) {
            // response.body.uploadId is the newly generated `uploadId`
            response.body.parts = [];
            // Save it so the upload can be resumed after a page reload.
            setUploadId(key, response.body.uploadId);
            return response;
        });
};
Multipart upload
When dividing a large file into parts, compare it with the list of uploaded parts to determine if re-uploading is necessary.
/**
 * Look up the eTag of an already-uploaded part by its part number.
 * Returns null when that part has not been uploaded yet.
 */
function getEtag(partNumber, parts) {
    for (var i = 0; i < parts.length; i++) {
        var part = parts[i];
        if (part.partNumber === partNumber) {
            return part.eTag;
        }
    }
    return null;
}

/**
 * Split `file` into upload tasks of at most `chunkSize` bytes each.
 * A task whose part is found in `parts` (already on the server) carries
 * an `etag` property, which lets the uploader skip it.
 */
function getTasks(file, uploadId, chunkSize, bucket, object, parts) {
    var tasks = [];
    var start = 0;
    for (var partNumber = 1; start < file.size; partNumber++) {
        var partSize = Math.min(file.size - start, chunkSize);
        var task = {
            file: file,
            uploadId: uploadId,
            bucket: bucket,
            object: object,
            partNumber: partNumber,
            partSize: partSize,
            start: start,
            stop: start + partSize - 1
        };
        // Record the etag when this part was uploaded in a previous session.
        var etag = getEtag(partNumber, parts);
        if (etag) {
            task.etag = etag;
        }
        tasks.push(task);
        start += partSize;
    }
    return tasks;
}
When processing the parts, decide whether a part must be uploaded based on whether its task carries the etag field:
/**
 * Create an async.mapLimit worker that uploads one part.
 *
 * `state.loaded` is incremented for every part actually sent, so callers
 * can report progress. Tasks that already carry an `etag` (recovered via
 * listParts) are skipped: the callback fires immediately with a response
 * shaped like a successful uploadPartFromBlob result, so the final part
 * list can be assembled uniformly.
 */
function uploadPartFile(state, bosClient) {
    return function (task, callback) {
        if (task.etag) {
            // Already uploaded in a previous session — skip the network call.
            callback(null, {
                http_headers: {
                    etag: task.etag
                },
                body: {}
            });
            return;
        }
        // Cut the part out of the file; `stop` is inclusive, slice's end is not.
        var blob = task.file.slice(task.start, task.stop + 1);
        // Field names must match the tasks built by getTasks()
        // (the original read task.bucketName / task.key, which are undefined).
        bosClient.uploadPartFromBlob(task.bucket, task.object, task.uploadId, task.partNumber, task.partSize, blob)
            .then(function (res) {
                ++state.loaded;
                // Fixed: the original called the non-existent `callbacknull(res)`.
                callback(null, res);
            })
            .catch(function (err) {
                callback(err);
            });
    };
}
Process code
We have made some minor modifications to the code of each step, but the code of the entire process is very similar to that of multipart upload:
// `file`, `bucket` and `object` are assumed to be provided by the page
// (e.g. `file` from an <input type="file"> change event).
var chunkSize = 5 * 1024 * 1024; // Part size: 5 MB per part
var uploadId;
// The localStorage key for this task — needed again at the end to clean up.
// (The original passed the object name to removeUploadId, which never
// matched the key written by setUploadId, so stale entries accumulated.)
var localKey = generateLocalKey(file, chunkSize, bucket, object);
initiateMultipartUpload(file, chunkSize, bucket, object)
    .then(function (response) {
        // uploadId: freshly issued by the server, or restored from localStorage
        uploadId = response.body.uploadId;
        // Parts already on the server; empty array for a brand-new upload
        var parts = response.body.parts || [];
        var deferred = sdk.Q.defer();
        var tasks = getTasks(file, uploadId, chunkSize, bucket, object, parts);
        var state = {
            lengthComputable: true,
            loaded: parts.length, // number of parts already uploaded
            total: tasks.length
        };
        // Report initial progress so a resumed upload does not start at 0%
        bosClient.emit('progress', state);
        // The async library (https://github.com/caolan/async) limits concurrency
        var THREADS = 2; // number of parts uploaded simultaneously
        async.mapLimit(tasks, THREADS, uploadPartFile(state, bosClient), function (err, results) {
            if (err) {
                deferred.reject(err);
            }
            else {
                deferred.resolve(results);
            }
        });
        return deferred.promise;
    })
    .then(function (allResponse) {
        // Build the part list, in partNumber order, for completeMultipartUpload
        var partList = allResponse.map(function (response, index) {
            return {
                partNumber: index + 1,
                eTag: response.http_headers.etag
            };
        });
        return bosClient.completeMultipartUpload(bucket, object, uploadId, partList); // Complete the upload
    })
    .then(function (res) {
        // Upload completed. Only now is it safe to forget the uploadId —
        // removing it before completeMultipartUpload (as the original did)
        // would make the upload non-resumable if completion failed.
        removeUploadId(localKey);
    })
    .catch(function (err) {
        // Upload failed, add your error handling here
        console.error(err);
    });
