OzVa Git service - ozva-cloud/commitdiff
feat: guess plain text encoding then set content-type charset (#186)
author: sigoden <sigoden@gmail.com>
Wed, 1 Mar 2023 01:36:59 +0000 (09:36 +0800)
committer: GitHub <noreply@github.com>
Wed, 1 Mar 2023 01:36:59 +0000 (09:36 +0800)
Cargo.lock
Cargo.toml
src/server.rs
tests/fixtures.rs
tests/http.rs

index 456285a61665184fbbe7c55a9301d5976ebb9468..49fb386507808c5c09b27df2a070a5d59430f8a4 100644 (file)
@@ -204,6 +204,17 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
 
+[[package]]
+name = "chardetng"
+version = "0.1.17"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "14b8f0b65b7b08ae3c8187e8d77174de20cb6777864c6b832d8ad365999cf1ea"
+dependencies = [
+ "cfg-if",
+ "encoding_rs",
+ "memchr",
+]
+
 [[package]]
 name = "chrono"
 version = "0.4.23"
@@ -425,6 +436,7 @@ dependencies = [
  "async-stream",
  "async_zip",
  "base64 0.21.0",
+ "chardetng",
  "chrono",
  "clap",
  "clap_complete",
index 068ecd91880e5d392c81c10b99283bf9089bdef8..8a7a68c6b86030252e910e7d74e09a461d0175bd 100644 (file)
@@ -42,6 +42,7 @@ form_urlencoded = "1.0"
 alphanumeric-sort = "1.4"
 content_inspector = "0.2"
 anyhow = "1.0"
+chardetng = "0.1"
 
 [features]
 default = ["tls"]
index feb2f3dd87db66ef2ca3dd02ab780996acb657fe..d026d8c743dd331507f264036677bc92ad945071 100644 (file)
@@ -638,14 +638,10 @@ impl Server {
             None
         };
 
-        if let Some(mime) = mime_guess::from_path(path).first() {
-            res.headers_mut().typed_insert(ContentType::from(mime));
-        } else {
-            res.headers_mut().insert(
-                CONTENT_TYPE,
-                HeaderValue::from_static("application/octet-stream"),
-            );
-        }
+        res.headers_mut().insert(
+            CONTENT_TYPE,
+            HeaderValue::from_str(&get_content_type(path).await?)?,
+        );
 
         let filename = try_get_file_name(path)?;
         res.headers_mut().insert(
@@ -1382,3 +1378,34 @@ fn set_webdav_headers(res: &mut Response) {
     res.headers_mut()
         .insert("DAV", HeaderValue::from_static("1,2"));
 }
+
+async fn get_content_type(path: &Path) -> Result<String> {
+    let mut buffer: Vec<u8> = vec![];
+    fs::File::open(path)
+        .await?
+        .take(1024)
+        .read_to_end(&mut buffer)
+        .await?;
+    let mime = mime_guess::from_path(path).first();
+    let is_text = content_inspector::inspect(&buffer).is_text();
+    let content_type = if is_text {
+        let mut detector = chardetng::EncodingDetector::new();
+        detector.feed(&buffer, buffer.len() < 1024);
+        let (enc, confident) = detector.guess_assess(None, true);
+        let charset = if confident {
+            format!("; charset={}", enc.name())
+        } else {
+            "".into()
+        };
+        match mime {
+            Some(m) => format!("{m}{charset}"),
+            None => format!("text/plain{charset}"),
+        }
+    } else {
+        match mime {
+            Some(m) => m.to_string(),
+            None => "application/octet-stream".into(),
+        }
+    };
+    Ok(content_type)
+}
index b78567015c0f4eaec8a0f9959968209c4a13ef3b..b855ba65cdf7dca82d8b8b43074b1c5561345653 100644 (file)
@@ -46,15 +46,12 @@ pub fn tmpdir() -> TempDir {
     let tmpdir = assert_fs::TempDir::new().expect("Couldn't create a temp dir for tests");
     for file in FILES {
         if *file == BIN_FILE {
-            tmpdir
-                .child(file)
-                .write_binary(b"bin\0\0123")
-                .expect("Couldn't write to file");
+            tmpdir.child(file).write_binary(b"bin\0\0123").unwrap();
         } else {
             tmpdir
                 .child(file)
                 .write_str(&format!("This is {file}"))
-                .expect("Couldn't write to file");
+                .unwrap();
         }
     }
     for directory in DIRECTORIES {
@@ -62,7 +59,7 @@ pub fn tmpdir() -> TempDir {
             tmpdir
                 .child(format!("{}{}", directory, "index.html"))
                 .write_str("__ASSERTS_PREFIX__index.js;DATA = __INDEX_DATA__")
-                .expect("Couldn't write to file");
+                .unwrap();
         } else {
             for file in FILES {
                 if *directory == DIR_NO_INDEX && *file == "index.html" {
@@ -72,17 +69,37 @@ pub fn tmpdir() -> TempDir {
                     tmpdir
                         .child(format!("{directory}{file}"))
                         .write_binary(b"bin\0\0123")
-                        .expect("Couldn't write to file");
+                        .unwrap();
                 } else {
                     tmpdir
                         .child(format!("{directory}{file}"))
                         .write_str(&format!("This is {directory}{file}"))
-                        .expect("Couldn't write to file");
+                        .unwrap();
                 }
             }
         }
     }
     tmpdir.child("dir4/hidden").touch().unwrap();
+    tmpdir
+        .child("content-types/bin.tar")
+        .write_binary(b"\x7f\x45\x4c\x46\x02\x01\x00\x00")
+        .unwrap();
+    tmpdir
+        .child("content-types/bin")
+        .write_binary(b"\x7f\x45\x4c\x46\x02\x01\x00\x00")
+        .unwrap();
+    tmpdir
+        .child("content-types/file-utf8.txt")
+        .write_str("世界")
+        .unwrap();
+    tmpdir
+        .child("content-types/file-gbk.txt")
+        .write_binary(b"\xca\xc0\xbd\xe7")
+        .unwrap();
+    tmpdir
+        .child("content-types/file")
+        .write_str("世界")
+        .unwrap();
 
     tmpdir
 }
index ee9ff6beafd1604dc76cf8c8a08a0bdc37b1d470..6ae77909cf51b2cdcf1e1ba488bcaebf02581e65 100644 (file)
@@ -148,7 +148,10 @@ fn empty_search(#[with(&["-A"])] server: TestServer) -> Result<(), Error> {
 fn get_file(server: TestServer) -> Result<(), Error> {
     let resp = reqwest::blocking::get(format!("{}index.html", server.url()))?;
     assert_eq!(resp.status(), 200);
-    assert_eq!(resp.headers().get("content-type").unwrap(), "text/html");
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "text/html; charset=UTF-8"
+    );
     assert_eq!(resp.headers().get("accept-ranges").unwrap(), "bytes");
     assert!(resp.headers().contains_key("etag"));
     assert!(resp.headers().contains_key("last-modified"));
@@ -161,7 +164,10 @@ fn get_file(server: TestServer) -> Result<(), Error> {
 fn head_file(server: TestServer) -> Result<(), Error> {
     let resp = fetch!(b"HEAD", format!("{}index.html", server.url())).send()?;
     assert_eq!(resp.status(), 200);
-    assert_eq!(resp.headers().get("content-type").unwrap(), "text/html");
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "text/html; charset=UTF-8"
+    );
     assert_eq!(resp.headers().get("accept-ranges").unwrap(), "bytes");
     assert!(resp.headers().contains_key("content-disposition"));
     assert!(resp.headers().contains_key("etag"));
@@ -259,3 +265,33 @@ fn delete_file_404(#[with(&["-A"])] server: TestServer) -> Result<(), Error> {
     assert_eq!(resp.status(), 404);
     Ok(())
 }
+
+#[rstest]
+fn get_file_content_type(server: TestServer) -> Result<(), Error> {
+    let resp = reqwest::blocking::get(format!("{}content-types/bin.tar", server.url()))?;
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "application/x-tar"
+    );
+    let resp = reqwest::blocking::get(format!("{}content-types/bin", server.url()))?;
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "application/octet-stream"
+    );
+    let resp = reqwest::blocking::get(format!("{}content-types/file-utf8.txt", server.url()))?;
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "text/plain; charset=UTF-8"
+    );
+    let resp = reqwest::blocking::get(format!("{}content-types/file-gbk.txt", server.url()))?;
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "text/plain; charset=GBK"
+    );
+    let resp = reqwest::blocking::get(format!("{}content-types/file", server.url()))?;
+    assert_eq!(
+        resp.headers().get("content-type").unwrap(),
+        "text/plain; charset=UTF-8"
+    );
+    Ok(())
+}