@@ -1,11 +1,11 @@
 /// HTTP Server logic
 use crate::http::types::{
     DecodeRequest, DecodeResponse, EmbedAllRequest, EmbedAllResponse, EmbedRequest, EmbedResponse,
-    EmbedSparseRequest, EmbedSparseResponse, Input, InputIds, OpenAICompatEmbedding,
+    EmbedSparseRequest, EmbedSparseResponse, Input, InputIds, InputType, OpenAICompatEmbedding,
     OpenAICompatErrorResponse, OpenAICompatRequest, OpenAICompatResponse, OpenAICompatUsage,
     PredictInput, PredictRequest, PredictResponse, Prediction, Rank, RerankRequest, RerankResponse,
-    Sequence, SimpleToken, SparseValue, TokenizeRequest, TokenizeResponse, VertexPrediction,
-    VertexRequest, VertexResponse,
+    Sequence, SimpleToken, SparseValue, TokenizeInput, TokenizeRequest, TokenizeResponse,
+    VertexPrediction, VertexRequest, VertexResponse,
 };
 use crate::{
     shutdown, ClassifierModel, EmbeddingModel, ErrorResponse, ErrorType, Info, ModelType,
@@ -474,7 +474,7 @@ async fn embed(
         Input::Single(input) => {
             metrics::increment_counter!("te_request_count", "method" => "single");

-            let compute_chars = input.chars().count();
+            let compute_chars = input.count_chars();

             let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?;
             let response = infer
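Throughout the embed handlers, here and in the matching hunks for `embed_sparse`, `embed_all`, and `openai_embed` below, `input.chars().count()` becomes `input.count_chars()`: with the new `InputType` import, `input` is no longer a plain `String` but may also carry pre-tokenized ids. A minimal sketch of what the type could look like, inferred from its use in this diff (the real definition lives in `crate::http::types`, and its exact derives and variants may differ):

```rust
use serde::Deserialize;
use utoipa::ToSchema;

/// Sketch: either raw text or pre-tokenized ids. Untagged, so `"hello"`
/// and `[101, 102]` both deserialize from the same JSON field.
#[derive(Deserialize, ToSchema, Debug)]
#[serde(untagged)]
pub(crate) enum InputType {
    String(String),
    Ids(Vec<u32>),
}

impl InputType {
    /// Unified "size" metric for the `te_request_*` counters:
    /// character count for text, id count for pre-tokenized input.
    pub(crate) fn count_chars(&self) -> usize {
        match self {
            InputType::String(s) => s.chars().count(),
            InputType::Ids(ids) => ids.len(),
        }
    }
}
```

Keeping the size metric behind one method lets every handler stay agnostic to which wire format the client chose.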
@@ -529,7 +529,7 @@ async fn embed(
             let mut compute_chars = 0;

             for input in inputs {
-                compute_chars += input.chars().count();
+                compute_chars += input.count_chars();

                 let local_infer = infer.clone();
                 futures.push(async move {
@@ -630,7 +630,7 @@ async fn embed_sparse(
         Input::Single(input) => {
             metrics::increment_counter!("te_request_count", "method" => "single");

-            let compute_chars = input.chars().count();
+            let compute_chars = input.count_chars();

             let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?;
             let response = infer
@@ -685,7 +685,7 @@ async fn embed_sparse(
             let mut compute_chars = 0;

             for input in inputs {
-                compute_chars += input.chars().count();
+                compute_chars += input.count_chars();

                 let local_infer = infer.clone();
                 futures.push(async move {
@@ -778,7 +778,7 @@ async fn embed_all(
         Input::Single(input) => {
             metrics::increment_counter!("te_request_count", "method" => "single");

-            let compute_chars = input.chars().count();
+            let compute_chars = input.count_chars();

            let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?;
             let response = infer
@@ -833,7 +833,7 @@ async fn embed_all(
             let mut compute_chars = 0;

             for input in inputs {
-                compute_chars += input.chars().count();
+                compute_chars += input.count_chars();

                 let local_infer = infer.clone();
                 futures.push(async move {
@@ -892,7 +892,7 @@ async fn embed_all(
 #[utoipa::path(
 post,
 tag = "Text Embeddings Inference",
-path = "/embeddings",
+path = "/v1/embeddings",
 request_body = OpenAICompatRequest,
 responses(
 (status = 200, description = "Embeddings", body = OpenAICompatResponse),
@@ -923,7 +923,7 @@ async fn openai_embed(
         Input::Single(input) => {
             metrics::increment_counter!("te_request_count", "method" => "single");

-            let compute_chars = input.chars().count();
+            let compute_chars = input.count_chars();

             let permit = infer.try_acquire_permit().map_err(ErrorResponse::from)?;
             let response = infer
@@ -982,7 +982,7 @@ async fn openai_embed(
             let mut compute_chars = 0;

             for input in inputs {
-                compute_chars += input.chars().count();
+                compute_chars += input.count_chars();

                 let local_infer = infer.clone();
                 futures.push(async move {
@@ -1107,8 +1107,10 @@ async fn tokenize(
     };

     let tokens = match req.inputs {
-        Input::Single(input) => vec![tokenize_inner(input, req.add_special_tokens, infer.0).await?],
-        Input::Batch(inputs) => {
+        TokenizeInput::Single(input) => {
+            vec![tokenize_inner(input, req.add_special_tokens, infer.0).await?]
+        }
+        TokenizeInput::Batch(inputs) => {
             if inputs.is_empty() {
                 let message = "`inputs` cannot be empty".to_string();
                 tracing::error!("{message}");
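`tokenize` now matches on a dedicated `TokenizeInput` instead of the shared `Input`, and the single-input arm gains a block body. A plausible shape for the enum, assuming the tokenize endpoint mirrors the single/batch split of `Input` (the variant payloads are an assumption, not shown in this diff):

```rust
use serde::Deserialize;
use utoipa::ToSchema;

/// Sketch: single/batch wrapper for /tokenize, decoupled from `Input`
/// so the two endpoints can evolve their payloads independently.
#[derive(Deserialize, ToSchema)]
#[serde(untagged)]
pub(crate) enum TokenizeInput {
    Single(String),
    Batch(Vec<String>),
}
```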
@@ -1369,9 +1371,11 @@ pub async fn run(
             EmbedResponse,
             ErrorResponse,
             OpenAICompatErrorResponse,
+            TokenizeInput,
             TokenizeRequest,
             TokenizeResponse,
             SimpleToken,
+            InputType,
             InputIds,
             DecodeRequest,
             DecodeResponse,
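Registering `TokenizeInput` and `InputType` under `components(schemas(...))` keeps the generated OpenAPI spec in sync with the new request shapes. Under the sketched `InputType` above, the two accepted JSON forms would behave like this (a hypothetical test, not part of the commit):

```rust
#[cfg(test)]
mod input_type_tests {
    use super::InputType;

    #[test]
    fn accepts_text_and_token_ids() {
        // Raw text: counted in characters.
        let text: InputType = serde_json::from_str(r#""Deep Learning""#).unwrap();
        assert_eq!(text.count_chars(), 13);

        // Pre-tokenized ids: counted per id.
        let ids: InputType = serde_json::from_str("[101, 7592, 102]").unwrap();
        assert_eq!(ids.count_chars(), 3);
    }
}
```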
@@ -1448,6 +1452,7 @@ pub async fn run(
         .route("/decode", post(decode))
         // OpenAI compat route
         .route("/embeddings", post(openai_embed))
+        .route("/v1/embeddings", post(openai_embed))
         // Vertex compat route
         .route("/vertex", post(vertex_compatibility))
         // Base Health route
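With this route, the OpenAI-compatible handler is reachable both at the legacy `/embeddings` path and at `/v1/embeddings`, the prefix OpenAI clients default to, matching the updated `utoipa::path` annotation above. A quick smoke test against a locally running server might look like this (hypothetical host, port, and payload; assumes `reqwest` with the `json` feature, `serde_json`, and `tokio` as dependencies):

```rust
#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    // Same handler behind both paths, so either URL should return an
    // OpenAICompatResponse with a `data[0].embedding` array.
    let response = reqwest::Client::new()
        .post("http://localhost:8080/v1/embeddings")
        .json(&serde_json::json!({
            "input": "What is Deep Learning?",
            "model": "tei" // field accepted for OpenAI-client compatibility
        }))
        .send()
        .await?;
    println!("{}", response.text().await?);
    Ok(())
}
```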