当前位置: 首页 > news >正文

国内快速高效下载 HuggingFace上的各种大语言模型

预先安装:

apt install aria2
# sudo apt install aria2
apt install git-lfs
# sudo apt install git-lfs
  1. 下载hfd
wget https://hf-mirror.com/hfd/hfd.sh
chmod a+x hfd.sh
  2. 设置环境变量

Linux

export HF_ENDPOINT=https://hf-mirror.com

Windows (PowerShell)

$env:HF_ENDPOINT = "https://hf-mirror.com"
  3. 下载模型
./hfd.sh gpt2 --tool aria2c -x 4
  4. 下载数据集
./hfd.sh wikitext --dataset --tool aria2c -x 4

本文参考:https://hf-mirror.com/

自己保存hfd脚本:hfd.sh

#!/usr/bin/env bash
#
# hfd - download a model or dataset repository from Hugging Face, or from the
# mirror selected through the HF_ENDPOINT environment variable.
#
# This part of the script defines the colour constants, the Ctrl-C handler,
# the usage text and the command-line parser.

# Color definitions
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
NC='\033[0m' # No Color

# On Ctrl-C, tell the user that re-running resumes the download, then exit 1.
trap 'printf "%b\nDownload interrupted. If you re-run the command, you can resume the download from the breakpoint.\n%b" "$YELLOW" "$NC"; exit 1' INT

#######################################
# Print the usage text to stdout and terminate the script.
# Arguments: none
# Returns:   never returns; always exits with status 1.
#######################################
display_help() {
  # The delimiter is quoted so the text (which contains '*' and quotes) is
  # emitted literally; it contains no expansions.
  cat << 'EOF'
Usage:
  hfd <repo_id> [--include include_pattern] [--exclude exclude_pattern] [--hf_username username] [--hf_token token] [--tool aria2c|wget] [-x threads] [--dataset] [--local-dir path]

Description:
  Downloads a model or dataset from Hugging Face using the provided repo ID.

Parameters:
  repo_id        The Hugging Face repo ID in the format 'org/repo_name'.
  --include       (Optional) Flag to specify a string pattern to include files for downloading.
  --exclude       (Optional) Flag to specify a string pattern to exclude files from downloading.
  include/exclude_pattern The pattern to match against filenames, supports wildcard characters. e.g., '--exclude *.safetensor', '--include vae/*'.
  --hf_username   (Optional) Hugging Face username for authentication. **NOT EMAIL**.
  --hf_token      (Optional) Hugging Face token for authentication.
  --tool          (Optional) Download tool to use. Can be aria2c (default) or wget.
  -x              (Optional) Number of download threads for aria2c. Defaults to 4.
  --dataset       (Optional) Flag to indicate downloading a dataset.
  --local-dir     (Optional) Local directory path where the model or dataset will be stored.

Example:
  hfd bigscience/bloom-560m --exclude *.safetensors
  hfd meta-llama/Llama-2-7b --hf_username myuser --hf_token mytoken -x 4
  hfd lavita/medical-qa-shared-task-v1-toy --dataset
EOF
  exit 1
}

#######################################
# Abort when an option that needs a value is the last argument.
# Arguments: $1 - option name, $2 - number of arguments left (option included)
# Outputs:   error message on stderr
# Returns:   0 when a value is present; exits 1 otherwise.
#######################################
require_value() {
  if (( $2 < 2 )); then
    printf '%bOption %s requires a value.%b\n' "$RED" "$1" "$NC" >&2
    exit 1
  fi
}

#######################################
# Parse the options that follow the repo ID.
# Globals:   writes INCLUDE_PATTERN, EXCLUDE_PATTERN, HF_USERNAME, HF_TOKEN,
#            TOOL, THREADS, DATASET, LOCAL_DIR
# Arguments: the command-line arguments after the repo ID
# Returns:   0; exits 1 when a value-taking option has no value.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      # 'shift 2' with a single remaining argument fails WITHOUT shifting,
      # which would spin this loop forever; require_value guards against it.
      --include)     require_value "$1" "$#"; INCLUDE_PATTERN="$2"; shift 2 ;;
      --exclude)     require_value "$1" "$#"; EXCLUDE_PATTERN="$2"; shift 2 ;;
      --hf_username) require_value "$1" "$#"; HF_USERNAME="$2"; shift 2 ;;
      --hf_token)    require_value "$1" "$#"; HF_TOKEN="$2"; shift 2 ;;
      --tool)        require_value "$1" "$#"; TOOL="$2"; shift 2 ;;
      -x)            require_value "$1" "$#"; THREADS="$2"; shift 2 ;;
      --dataset)     DATASET=1; shift ;;
      --local-dir)   require_value "$1" "$#"; LOCAL_DIR="$2"; shift 2 ;;
      # Unrecognised arguments are skipped silently.
      *)             shift ;;
    esac
  done
}

# The first positional argument is the repo ID; everything after it is options.
MODEL_ID=$1
(( $# > 0 )) && shift

# Default values
TOOL="aria2c"
THREADS=4
HF_ENDPOINT=${HF_ENDPOINT:-"https://huggingface.co"}

parse_args "$@"
# Check if aria2, wget, curl, git, and git-lfs are installed
#######################################
# Abort the script when a required external program cannot be found.
# Globals:   RED, NC (read) - colour escape sequences for the message
# Arguments: $1 - name of the command to look up
# Outputs:   an error message on stderr when the command is missing
# Returns:   0 when the command exists; exits 1 otherwise.
#######################################
check_command() {
  if ! command -v "$1" &>/dev/null; then
    # %b expands the colour escapes; the command name goes through %s so it
    # is never interpreted as part of the format.
    printf '%b%s is not installed. Please install it first.%b\n' "$RED" "$1" "$NC" >&2
    exit 1
  fi
}
# Mark current repo safe when using shared file system like samba or nfs
#######################################
# If git refuses to work in the current directory because of "dubious
# ownership", register the directory as a git safe.directory.
# Globals:   PWD, YELLOW, NC (read)
# Outputs:   a notice on stdout when the safe.directory entry is added
# Returns:   0
# Side effect: appends to the user's global git configuration.
#######################################
ensure_ownership() {
  if git status 2>&1 | grep -q "fatal: detected dubious ownership in repository at"; then
    git config --global --add safe.directory "${PWD}"
    printf '%bDetected dubious ownership in repository, mark %s safe using git, edit ~/.gitconfig if you want to reverse this.\n%b' "$YELLOW" "$PWD" "$NC"
  fi
}

# Show help before checking dependencies so that usage is available even on a
# machine where git-lfs or the download tool is not installed yet.
[[ -z "$MODEL_ID" || "$MODEL_ID" =~ ^(-h|--help) ]] && display_help

[[ "$TOOL" == "aria2c" ]] && check_command aria2c
[[ "$TOOL" == "wget" ]] && check_command wget
check_command curl
check_command git
check_command git-lfs

# Default target directory: the repo name without the "org/" prefix.
if [[ -z "$LOCAL_DIR" ]]; then
  LOCAL_DIR="${MODEL_ID#*/}"
fi

# Dataset repositories live under the "datasets/" path on the endpoint.
if [[ "$DATASET" == 1 ]]; then
  MODEL_ID="datasets/$MODEL_ID"
fi
echo "Downloading to $LOCAL_DIR"

if [[ -d "$LOCAL_DIR/.git" ]]; then
  # Existing checkout: update it instead of cloning again.
  printf '%b%s exists, Skip Clone.\n%b' "$YELLOW" "$LOCAL_DIR" "$NC"
  if ! { cd "$LOCAL_DIR" && ensure_ownership && GIT_LFS_SKIP_SMUDGE=1 git pull; }; then
    printf '%bGit pull failed.%b\n' "$RED" "$NC" >&2
    exit 1
  fi
else
  REPO_URL="$HF_ENDPOINT/$MODEL_ID"
  GIT_REFS_URL="${REPO_URL}/info/refs?service=git-upload-pack"
  echo "Testing GIT_REFS_URL: $GIT_REFS_URL"
  # Probe the git endpoint first to find out whether credentials are needed.
  response=$(curl -s -o /dev/null -w "%{http_code}" "$GIT_REFS_URL")
  if [[ "$response" == "401" || "$response" == "403" ]]; then
    if [[ -z "$HF_USERNAME" || -z "$HF_TOKEN" ]]; then
      printf '%bHTTP Status Code: %s.\nThe repository requires authentication, but --hf_username and --hf_token is not passed. Please get token from https://huggingface.co/settings/tokens.\nExiting.\n%b' "$RED" "$response" "$NC" >&2
      exit 1
    fi
    REPO_URL="https://$HF_USERNAME:$HF_TOKEN@${HF_ENDPOINT#https://}/$MODEL_ID"
  elif [[ "$response" != "200" ]]; then
    printf '%bUnexpected HTTP Status Code: %s\n%b' "$RED" "$response" "$NC" >&2
    printf '%bExecuting debug command: curl -v %s\nOutput:%b\n' "$YELLOW" "$GIT_REFS_URL" "$NC"
    curl -v "$GIT_REFS_URL"
    printf '\n%bGit clone failed.\n%b' "$RED" "$NC" >&2
    exit 1
  fi
  # NOTE(review): for authenticated repos this echoes the token embedded in
  # REPO_URL to the terminal; consider masking it.
  echo "GIT_LFS_SKIP_SMUDGE=1 git clone $REPO_URL $LOCAL_DIR"
  # GIT_LFS_SKIP_SMUDGE=1 clones only the LFS pointer files, not the payloads.
  if ! { GIT_LFS_SKIP_SMUDGE=1 git clone "$REPO_URL" "$LOCAL_DIR" && cd "$LOCAL_DIR"; }; then
    printf '%bGit clone failed.\n%b' "$RED" "$NC" >&2
    exit 1
  fi
  ensure_ownership
  # Empty every LFS-tracked file - presumably so the resumable downloads
  # (-c) below do not treat the pointer text as partial content.
  # Process substitution yields no iteration at all when there are no LFS
  # files, unlike a here-string, which would feed one empty line.
  while IFS= read -r file; do
    [[ -n "$file" ]] || continue
    truncate -s 0 "$file"
  done < <(git lfs ls-files | cut -d ' ' -f 3-)
fi

printf '\nStart Downloading lfs files, bash script:\ncd %s\n' "$LOCAL_DIR"
# 'git lfs ls-files' prints "<oid> <marker> <path>"; keep the path (field 3+).
files=$(git lfs ls-files | cut -d ' ' -f 3-)

# First pass: print the equivalent command for every LFS file (commented out
# when filtered by --include/--exclude) and collect the ones to download.
# Parallel arrays avoid re-splitting on a separator that a path may contain.
declare -a urls=()
declare -a targets=()
while IFS= read -r file; do
  [[ -n "$file" ]] || continue
  url="$HF_ENDPOINT/$MODEL_ID/resolve/main/$file"
  file_dir=$(dirname "$file")
  mkdir -p "$file_dir"
  if [[ "$TOOL" == "wget" ]]; then
    download_cmd="wget -c \"$url\" -O \"$file\""
    [[ -n "$HF_TOKEN" ]] && download_cmd="wget --header=\"Authorization: Bearer ${HF_TOKEN}\" -c \"$url\" -O \"$file\""
  else
    download_cmd="aria2c --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\""
    [[ -n "$HF_TOKEN" ]] && download_cmd="aria2c --header=\"Authorization: Bearer ${HF_TOKEN}\" --console-log-level=error --file-allocation=none -x $THREADS -s $THREADS -k 1M -c \"$url\" -d \"$file_dir\" -o \"$(basename "$file")\""
  fi
  # The patterns are intentionally unquoted so they act as globs.
  if [[ -n "$INCLUDE_PATTERN" && "$file" != $INCLUDE_PATTERN ]]; then
    printf '# %s\n' "$download_cmd"
    continue
  fi
  if [[ -n "$EXCLUDE_PATTERN" && "$file" == $EXCLUDE_PATTERN ]]; then
    printf '# %s\n' "$download_cmd"
    continue
  fi
  printf '%s\n' "$download_cmd"
  urls+=("$url")
  targets+=("$file")
done <<< "$files"

# Second pass: download the selected files, stopping at the first failure.
for i in "${!urls[@]}"; do
  url=${urls[i]}
  file=${targets[i]}
  printf '%bStart downloading %s.\n%b' "$YELLOW" "$file" "$NC"
  file_dir=$(dirname "$file")
  # Build the command as an array so the auth header is added exactly once;
  # a failed authenticated download must not be retried without credentials.
  if [[ "$TOOL" == "wget" ]]; then
    cmd=(wget)
    [[ -n "$HF_TOKEN" ]] && cmd+=(--header="Authorization: Bearer ${HF_TOKEN}")
    cmd+=(-c "$url" -O "$file")
  else
    cmd=(aria2c)
    [[ -n "$HF_TOKEN" ]] && cmd+=(--header="Authorization: Bearer ${HF_TOKEN}")
    cmd+=(--console-log-level=error --file-allocation=none
      -x "$THREADS" -s "$THREADS" -k 1M -c "$url"
      -d "$file_dir" -o "$(basename "$file")")
  fi
  if "${cmd[@]}"; then
    printf 'Downloaded %s successfully.\n' "$url"
  else
    printf '%bFailed to download %s.\n%b' "$RED" "$url" "$NC" >&2
    exit 1
  fi
done
printf '%bDownload completed successfully.\n%b' "$GREEN" "$NC"
http://www.lryc.cn/news/431941.html

相关文章:

  • linux proxy 【linux 代理】
  • AcWing907. 区间覆盖
  • Unity TMP (TextMeshPro) 更新中文字符集
  • Leetcode3259. 超级饮料的最大强化能量
  • Java题集(由入门到精通)03
  • zblog自动生成文章插件(百度AI写作配图,图文并茂)
  • 华为 HCIP-Datacom H12-821 题库 (4)
  • 使用seq_file
  • 期货赫兹量化-种群优化算法:进化策略,(μ,λ)-ES 和 (μ+λ)-ES
  • pytest实战演练
  • 7、关于LoFTR
  • 硬件工程师笔试面试知识器件篇——电感
  • 代码随想录八股训练营第三十六天| C++
  • 学习计算机网络
  • Django发送邮件
  • T7:咖啡豆识别
  • 【MATLAB】FIR滤波器的MATLAB实现
  • 【RabbitMQ之一:windows环境下安装RabbitMQ】
  • ISO26262和Aspice之间的关联
  • 对极约束及其性质 —— 公式详细推导
  • 【论文精读】SCINet-基于降采样和交互学习的时序卷积模型
  • 深度学习与大模型第1课环境搭建
  • JDK新特性
  • 数据处理与数据填充在Pandas中的应用
  • 【百日算法计划】:每日一题,见证成长(010)
  • 【WPF】WPF学习之【二】布局学习
  • KEIL中编译51程序 算法计算异常的疑问
  • pikachu文件包含漏洞靶场
  • 基于DPU与SmartNIC的K8s Service解决方案
  • SLM561A​​系列 60V 10mA到50mA线性恒流LED驱动芯片 为智能家居照明注入新活力