Commit 317edcf

Merge branch 'main' into it2t-patch
2 parents ab1e2ec + 4fb92f5 commit 317edcf

File tree: 122 files changed (+2497 / -1157 lines)

CODEOWNERS

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 # Ownership for the Inference Package

-/packages/inference/ @vvmnnnkv @radames
+/packages/inference/ @julien-c @hanouticelina @SBrandeis @coyotte508

 # Ownership for the Tasks Package

```
README.md

Lines changed: 1 addition & 1 deletion

````diff
@@ -95,7 +95,7 @@ You can run our packages with vanilla JS, without any bundler, by using a CDN or

 ```html
 <script type="module">
-import { HfInference } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@3.0.0/+esm';
+import { HfInference } from 'https://cdn.jsdelivr.net/npm/@huggingface/inference@3.1.2/+esm';
 import { createRepo, commit, deleteRepo, listFiles } from "https://cdn.jsdelivr.net/npm/@huggingface/[email protected]/+esm";
 </script>
 ```
````

packages/agents/pnpm-lock.yaml

Lines changed: 12 additions & 1 deletion (generated file; diff not rendered)

packages/inference/README.md

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,4 +1,4 @@
-# 🤗 Hugging Face Inference Endpoints
+# 🤗 Hugging Face Inference

 A Typescript powered wrapper for the Hugging Face Inference API (serverless), Inference Endpoints (dedicated), and third-party Inference Providers.
 It works with [Inference API (serverless)](https://huggingface.co/docs/api-inference/index) and [Inference Endpoints (dedicated)](https://huggingface.co/docs/inference-endpoints/index), and even with supported third-party Inference Providers.
```

packages/inference/package.json

Lines changed: 1 addition & 1 deletion

```diff
@@ -1,6 +1,6 @@
 {
   "name": "@huggingface/inference",
-  "version": "3.0.0",
+  "version": "3.1.2",
   "packageManager": "[email protected]",
   "license": "MIT",
   "author": "Tim Mikeladze <[email protected]>",
```

packages/inference/src/providers/fal-ai.ts

Lines changed: 4 additions & 0 deletions

```diff
@@ -20,4 +20,8 @@ export const FAL_AI_SUPPORTED_MODEL_IDS: ProviderMapping<FalAiId> = {
   "automatic-speech-recognition": {
     "openai/whisper-large-v3": "fal-ai/whisper",
   },
+  "text-to-video": {
+    "genmo/mochi-1-preview": "fal-ai/mochi-v1",
+    "tencent/HunyuanVideo": "fal-ai/hunyuan-video",
+  },
 };
```
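For context, a call routed through these new mappings might look like the following sketch. It assumes the `textToVideo` helper shipped by `@huggingface/inference` in this release and a placeholder token; the model ID comes from the mapping above.

```ts
import { textToVideo } from "@huggingface/inference";

// Sketch only: "genmo/mochi-1-preview" is translated to fal-ai's
// "fal-ai/mochi-v1" by the FAL_AI_SUPPORTED_MODEL_IDS mapping above.
const video: Blob = await textToVideo({
  accessToken: "hf_...", // placeholder token
  provider: "fal-ai",
  model: "genmo/mochi-1-preview",
  inputs: "A cat running through a sunlit meadow",
});
```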

packages/inference/src/providers/replicate.ts

Lines changed: 13 additions & 0 deletions

```diff
@@ -6,11 +6,24 @@ type ReplicateId = string;

 export const REPLICATE_SUPPORTED_MODEL_IDS: ProviderMapping<ReplicateId> = {
   "text-to-image": {
+    "black-forest-labs/FLUX.1-dev": "black-forest-labs/flux-dev",
     "black-forest-labs/FLUX.1-schnell": "black-forest-labs/flux-schnell",
+    "ByteDance/Hyper-SD":
+      "bytedance/hyper-flux-16step:382cf8959fb0f0d665b26e7e80b8d6dc3faaef1510f14ce017e8c732bb3d1eb7",
     "ByteDance/SDXL-Lightning":
       "bytedance/sdxl-lightning-4step:5599ed30703defd1d160a25a63321b4dec97101d98b4674bcc56e41f62f35637",
+    "playgroundai/playground-v2.5-1024px-aesthetic":
+      "playgroundai/playground-v2.5-1024px-aesthetic:a45f82a1382bed5c7aeb861dac7c7d191b0fdf74d8d57c4a0e6ed7d4d0bf7d24",
+    "stabilityai/stable-diffusion-3.5-large-turbo": "stability-ai/stable-diffusion-3.5-large-turbo",
+    "stabilityai/stable-diffusion-3.5-large": "stability-ai/stable-diffusion-3.5-large",
+    "stabilityai/stable-diffusion-3.5-medium": "stability-ai/stable-diffusion-3.5-medium",
+    "stabilityai/stable-diffusion-xl-base-1.0":
+      "stability-ai/sdxl:7762fd07cf82c948538e41f63f77d685e02b063e37e496e96eefd46c929f9bdc",
   },
   "text-to-speech": {
     "OuteAI/OuteTTS-0.3-500M": "jbilcke/oute-tts:39a59319327b27327fa3095149c5a746e7f2aee18c75055c3368237a6503cd26",
   },
+  "text-to-video": {
+    "genmo/mochi-1-preview": "genmoai/mochi-1:1944af04d098ef69bed7f9d335d102e652203f268ec4aaa2d836f6217217e460",
+  },
 };
```
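Note that several Replicate IDs pin an exact model version after the colon (`owner/model:versionhash`), while the others resolve to the provider's latest version. A usage sketch, assuming the package's `textToImage` helper and a placeholder token:

```ts
import { textToImage } from "@huggingface/inference";

// "stabilityai/stable-diffusion-3.5-large" resolves to Replicate's
// "stability-ai/stable-diffusion-3.5-large" via the mapping above.
const image: Blob = await textToImage({
  accessToken: "hf_...", // placeholder token
  provider: "replicate",
  model: "stabilityai/stable-diffusion-3.5-large",
  inputs: "An astronaut riding a horse, photorealistic",
});
```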

packages/inference/src/providers/together.ts

Lines changed: 2 additions & 0 deletions

```diff
@@ -21,6 +21,8 @@ export const TOGETHER_SUPPORTED_MODEL_IDS: ProviderMapping<TogetherId> = {
   },
   conversational: {
     "databricks/dbrx-instruct": "databricks/dbrx-instruct",
+    "deepseek-ai/DeepSeek-R1": "deepseek-ai/DeepSeek-R1",
+    "deepseek-ai/DeepSeek-V3": "deepseek-ai/DeepSeek-V3",
     "deepseek-ai/deepseek-llm-67b-chat": "deepseek-ai/deepseek-llm-67b-chat",
     "google/gemma-2-9b-it": "google/gemma-2-9b-it",
     "google/gemma-2b-it": "google/gemma-2-27b-it",
```

packages/inference/src/tasks/audio/audioClassification.ts

Lines changed: 7 additions & 22 deletions

```diff
@@ -1,27 +1,11 @@
+import type { AudioClassificationInput, AudioClassificationOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
+import type { LegacyAudioInput } from "./utils";
+import { preparePayload } from "./utils";

-export type AudioClassificationArgs = BaseArgs & {
-  /**
-   * Binary audio data
-   */
-  data: Blob | ArrayBuffer;
-};
-
-export interface AudioClassificationOutputValue {
-  /**
-   * The label for the class (model specific)
-   */
-  label: string;
-
-  /**
-   * A float that represents how likely it is that the audio file belongs to this class.
-   */
-  score: number;
-}
-
-export type AudioClassificationReturn = AudioClassificationOutputValue[];
+export type AudioClassificationArgs = BaseArgs & (AudioClassificationInput | LegacyAudioInput);

 /**
  * This task reads some audio input and outputs the likelihood of classes.
@@ -30,8 +14,9 @@ export type AudioClassificationReturn = AudioClassificationOutputValue[];
 export async function audioClassification(
   args: AudioClassificationArgs,
   options?: Options
-): Promise<AudioClassificationReturn> {
-  const res = await request<AudioClassificationReturn>(args, {
+): Promise<AudioClassificationOutput> {
+  const payload = preparePayload(args);
+  const res = await request<AudioClassificationOutput>(payload, {
     ...options,
     taskHint: "audio-classification",
   });
```
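After this change the function accepts the `inputs: Blob` shape from `@huggingface/tasks`, while the deprecated `data` field keeps working via `preparePayload`. A sketch of the new call style (model and audio source are placeholders):

```ts
import { audioClassification } from "@huggingface/inference";

// New-style input: a Blob under `inputs` (the legacy `{ data }` shape still works).
const audio = await (await fetch("https://example.com/sample.flac")).blob();
const labels = await audioClassification({
  accessToken: "hf_...", // placeholder token
  model: "superb/hubert-large-superb-er",
  inputs: audio,
});
// labels: Array<{ label: string; score: number }>
```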
packages/inference/src/tasks/audio/audioToAudio.ts

Lines changed: 43 additions & 23 deletions

```diff
@@ -1,15 +1,19 @@
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";
+import type { LegacyAudioInput } from "./utils";
+import { preparePayload } from "./utils";

-export type AudioToAudioArgs = BaseArgs & {
-  /**
-   * Binary audio data
-   */
-  data: Blob | ArrayBuffer;
-};
+export type AudioToAudioArgs =
+  | (BaseArgs & {
+      /**
+       * Binary audio data
+       */
+      inputs: Blob;
+    })
+  | LegacyAudioInput;

-export interface AudioToAudioOutputValue {
+export interface AudioToAudioOutputElem {
   /**
    * The label for the audio output (model specific)
    */
@@ -18,32 +22,48 @@ export interface AudioToAudioOutputValue {
   /**
    * Base64 encoded audio output.
    */
-  blob: string;
+  audio: Blob;
+}

-  /**
-   * Content-type for blob, e.g. audio/flac
-   */
+export interface AudioToAudioOutput {
+  blob: string;
   "content-type": string;
+  label: string;
 }

-export type AudioToAudioReturn = AudioToAudioOutputValue[];
-
 /**
  * This task reads some audio input and outputs one or multiple audio files.
  * Example model: speechbrain/sepformer-wham does audio source separation.
  */
-export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioReturn> {
-  const res = await request<AudioToAudioReturn>(args, {
+export async function audioToAudio(args: AudioToAudioArgs, options?: Options): Promise<AudioToAudioOutput[]> {
+  const payload = preparePayload(args);
+  const res = await request<AudioToAudioOutput>(payload, {
     ...options,
     taskHint: "audio-to-audio",
   });
-  const isValidOutput =
-    Array.isArray(res) &&
-    res.every(
-      (x) => typeof x.label === "string" && typeof x.blob === "string" && typeof x["content-type"] === "string"
-    );
-  if (!isValidOutput) {
-    throw new InferenceOutputError("Expected Array<{label: string, blob: string, content-type: string}>");
+
+  return validateOutput(res);
+}
+
+function validateOutput(output: unknown): AudioToAudioOutput[] {
+  if (!Array.isArray(output)) {
+    throw new InferenceOutputError("Expected Array");
+  }
+  if (
+    !output.every((elem): elem is AudioToAudioOutput => {
+      return (
+        typeof elem === "object" &&
+        elem &&
+        "label" in elem &&
+        typeof elem.label === "string" &&
+        "content-type" in elem &&
+        typeof elem["content-type"] === "string" &&
+        "blob" in elem &&
+        typeof elem.blob === "string"
+      );
+    })
+  ) {
+    throw new InferenceOutputError("Expected Array<{label: string, audio: Blob}>");
   }
-  return res;
+  return output;
 }
```
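A usage sketch under the new signature (token and audio source are placeholders); each element of the result carries a base64-encoded `blob` plus its `content-type`:

```ts
import { audioToAudio } from "@huggingface/inference";

const audio = await (await fetch("https://example.com/mixture.wav")).blob();
const tracks = await audioToAudio({
  accessToken: "hf_...", // placeholder token
  model: "speechbrain/sepformer-wham", // example model from the doc comment above
  inputs: audio,
});
for (const track of tracks) {
  console.log(track.label, track["content-type"]); // track.blob holds the base64 audio
}
```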
packages/inference/src/tasks/audio/automaticSpeechRecognition.ts

Lines changed: 35 additions & 23 deletions

```diff
@@ -1,22 +1,13 @@
+import type { AutomaticSpeechRecognitionInput, AutomaticSpeechRecognitionOutput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options, RequestArgs } from "../../types";
 import { base64FromBytes } from "../../utils/base64FromBytes";
 import { request } from "../custom/request";
+import type { LegacyAudioInput } from "./utils";
+import { preparePayload } from "./utils";
+import { omit } from "../../utils/omit";

-export type AutomaticSpeechRecognitionArgs = BaseArgs & {
-  /**
-   * Binary audio data
-   */
-  data: Blob | ArrayBuffer;
-};
-
-export interface AutomaticSpeechRecognitionOutput {
-  /**
-   * The text that was recognized from the audio
-   */
-  text: string;
-}
-
+export type AutomaticSpeechRecognitionArgs = BaseArgs & (AutomaticSpeechRecognitionInput | LegacyAudioInput);
 /**
  * This task reads some audio input and outputs the said words within the audio files.
  * Recommended model (english language): facebook/wav2vec2-large-960h-lv60-self
@@ -25,15 +16,8 @@ export async function automaticSpeechRecognition(
   args: AutomaticSpeechRecognitionArgs,
   options?: Options
 ): Promise<AutomaticSpeechRecognitionOutput> {
-  if (args.provider === "fal-ai") {
-    const contentType = args.data instanceof Blob ? args.data.type : "audio/mpeg";
-    const base64audio = base64FromBytes(
-      new Uint8Array(args.data instanceof ArrayBuffer ? args.data : await args.data.arrayBuffer())
-    );
-    (args as RequestArgs & { audio_url: string }).audio_url = `data:${contentType};base64,${base64audio}`;
-    delete (args as RequestArgs & { data: unknown }).data;
-  }
-  const res = await request<AutomaticSpeechRecognitionOutput>(args, {
+  const payload = await buildPayload(args);
+  const res = await request<AutomaticSpeechRecognitionOutput>(payload, {
     ...options,
     taskHint: "automatic-speech-recognition",
   });
@@ -43,3 +27,31 @@ export async function automaticSpeechRecognition(
   }
   return res;
 }
+
+const FAL_AI_SUPPORTED_BLOB_TYPES = ["audio/mpeg", "audio/mp4", "audio/wav", "audio/x-wav"];
+
+async function buildPayload(args: AutomaticSpeechRecognitionArgs): Promise<RequestArgs> {
+  if (args.provider === "fal-ai") {
+    const blob = "data" in args && args.data instanceof Blob ? args.data : "inputs" in args ? args.inputs : undefined;
+    const contentType = blob?.type;
+    if (!contentType) {
+      throw new Error(
+        `Unable to determine the input's content-type. Make sure your are passing a Blob when using provider fal-ai.`
+      );
+    }
+    if (!FAL_AI_SUPPORTED_BLOB_TYPES.includes(contentType)) {
+      throw new Error(
+        `Provider fal-ai does not support blob type ${contentType} - supported content types are: ${FAL_AI_SUPPORTED_BLOB_TYPES.join(
+          ", "
+        )}`
+      );
+    }
+    const base64audio = base64FromBytes(new Uint8Array(await blob.arrayBuffer()));
+    return {
+      ...("data" in args ? omit(args, "data") : omit(args, "inputs")),
+      audio_url: `data:${contentType};base64,${base64audio}`,
+    };
+  } else {
+    return preparePayload(args);
+  }
+}
```
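Since fal-ai requests are now built from the Blob's own `type`, callers must pass a Blob with one of the supported audio content-types. A sketch (token and audio bytes are placeholders):

```ts
import { automaticSpeechRecognition } from "@huggingface/inference";

// fal-ai requires a Blob whose type is audio/mpeg, audio/mp4, audio/wav, or audio/x-wav.
const audio = new Blob([/* raw audio bytes */], { type: "audio/mpeg" });
const { text } = await automaticSpeechRecognition({
  accessToken: "hf_...", // placeholder token
  provider: "fal-ai",
  model: "openai/whisper-large-v3", // mapped to "fal-ai/whisper"
  inputs: audio,
});
console.log(text);
```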
packages/inference/src/tasks/audio/textToSpeech.ts

Lines changed: 8 additions & 14 deletions

```diff
@@ -1,27 +1,25 @@
+import type { TextToSpeechInput } from "@huggingface/tasks";
 import { InferenceOutputError } from "../../lib/InferenceOutputError";
 import type { BaseArgs, Options } from "../../types";
 import { request } from "../custom/request";

-export type TextToSpeechArgs = BaseArgs & {
-  /**
-   * The text to generate an audio from
-   */
-  inputs: string;
-};
+type TextToSpeechArgs = BaseArgs & TextToSpeechInput;

-export type TextToSpeechOutput = Blob;
 interface OutputUrlTextToSpeechGeneration {
   output: string | string[];
 }
 /**
  * This task synthesize an audio of a voice pronouncing a given text.
  * Recommended model: espnet/kan-bayashi_ljspeech_vits
  */
-export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<TextToSpeechOutput> {
-  const res = await request<TextToSpeechOutput | OutputUrlTextToSpeechGeneration>(args, {
+export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<Blob> {
+  const res = await request<Blob | OutputUrlTextToSpeechGeneration>(args, {
     ...options,
     taskHint: "text-to-speech",
   });
+  if (res instanceof Blob) {
+    return res;
+  }
   if (res && typeof res === "object") {
     if ("output" in res) {
       if (typeof res.output === "string") {
@@ -35,9 +33,5 @@ export async function textToSpeech(args: TextToSpeechArgs, options?: Options): Promise<TextToSpeechOutput> {
     }
   }
-  const isValidOutput = res && res instanceof Blob;
-  if (!isValidOutput) {
-    throw new InferenceOutputError("Expected Blob");
-  }
-  return res;
+  throw new InferenceOutputError("Expected Blob or object with output");
 }
```
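With the early `instanceof Blob` return, callers get a single Blob whether the provider returned raw audio bytes or a URL-style payload. A usage sketch (token is a placeholder):

```ts
import { textToSpeech } from "@huggingface/inference";

const speech: Blob = await textToSpeech({
  accessToken: "hf_...", // placeholder token
  model: "espnet/kan-bayashi_ljspeech_vits", // recommended model from the doc comment
  inputs: "Hello world!",
});
// `speech` can be written to a file or played back directly.
```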
packages/inference/src/tasks/audio/utils.ts (new file)

Lines changed: 18 additions & 0 deletions

```diff
@@ -0,0 +1,18 @@
+import type { BaseArgs, RequestArgs } from "../../types";
+import { omit } from "../../utils/omit";
+
+/**
+ * @deprecated
+ */
+export interface LegacyAudioInput {
+  data: Blob | ArrayBuffer;
+}
+
+export function preparePayload(args: BaseArgs & ({ inputs: Blob } | LegacyAudioInput)): RequestArgs {
+  return "data" in args
+    ? args
+    : {
+        ...omit(args, "inputs"),
+        data: args.inputs,
+      };
+}
```
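This helper normalizes the new `inputs: Blob` shape back to the legacy `data` wire field, so downstream request code is unchanged. An illustrative sketch (values are hypothetical; the module is internal to the package, not part of its public API):

```ts
// New-style arguments: `inputs` is moved to the legacy `data` field.
preparePayload({ model: "some/model", inputs: new Blob(["..."]) });
// -> { model: "some/model", data: Blob }

// Legacy arguments pass through untouched.
preparePayload({ model: "some/model", data: new Blob(["..."]) });
// -> { model: "some/model", data: Blob }
```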
