当前位置: 首页 > news >正文

C++中使用Essentia实现STFT/ISTFT

最近在做一个项目,需要将音频送入 AI 模型进行处理。整个流程包括:

  • 从 .wav 文件中加载音频;

  • 进行 短时傅里叶变换(STFT);

  • 将变换结果输入模型;

  • 使用 逆STFT(ISTFT) 重建回时域信号。

作为一个长期从事图像方向的 CVer,我对音频领域相对陌生。在调研后发现,和计算机视觉相比,优质的音频处理库屈指可数。而手撸 STFT / ISTFT 又繁琐、容易出错。

最终,我选择了开源音频分析库 Essentia,其功能强大、API 结构清晰,非常适合科研和快速原型验证。本文将分享如何在 C++ 中基于 Essentia 实现 STFT / ISTFT,并完成音频的重建。

🔍 什么是 Essentia?

Essentia 是由巴塞罗那庞培法布拉大学音乐技术小组(MTG)开发的开源 C++ 音频分析库,发布于 AGPLv3 许可下。

Essentia 提供了一套丰富的算法模块,覆盖以下能力:

  • 音频 I/O 和预处理;

  • 数字信号处理(DSP)基础块;

  • 声学和音乐特征提取(如频谱、音调、节奏、情绪等);

  • Python 包装与 Vamp 插件支持,便于快速原型与可视化。

虽然资料较少、使用门槛稍高,但其结构良好,非常适合做科研实验或工业应用中的音频前处理部分。

🛠 编译 Essentia(推荐 Docker 环境)

手动编译步骤如下:

# Fetch the Essentia sources and enter the checkout.
git clone https://github.com/MTG/essentia.git
cd essentia
# Run the packaging script for the third-party dependencies
# (static, Debian variant, going by the script name).
packaging/build_3rdparty_static_debian.sh

⚠️ 如果第三方依赖下载失败,可以手动修改 packaging/debian_3rdparty/ 下的对应脚本,将源码下载到指定目录。

接着执行:

# Configure the build, passing the --with-static-examples option.
./waf configure --with-static-examples
# Run the build with the configuration produced above.
./waf

如果你希望跳过编译,可以从我上传的打包资源中下载使用。

📦 在 C++ 中使用 STFT

AlgorithmFactory& factory = standard::AlgorithmFactory::instance();
// 1. Load the audio: a MonoLoader is created for "test.wav" with sampleRate 16000,
//    its "audio" output is bound to `audio`, and the loader is deleted once it has run.
Algorithm* loader = factory.create("MonoLoader", "filename", "test.wav", "sampleRate", 16000);
vector<Real> audio;
loader->output("audio").set(audio);
loader->compute();
delete loader;

设置参数:

const int frameSize = 320; 
const int hopSize = 80;    // hop is 25% of frameSize, i.e. consecutive frames overlap by 75%

STFT 流程:

Algorithm* frameCutter = factory.create("FrameCutter","frameSize", frameSize,"hopSize", hopSize);
Algorithm* windowing = factory.create("Windowing","type", "hann","normalized", false);
Algorithm* fft = factory.create("FFT", "size", frameSize);frameCutter->input("signal").set(audio);
vector<Real> frame, windowedFrame;
vector<complex<Real>> fftFrame;frameCutter->output("frame").set(frame);
windowing->input("frame").set(frame);
windowing->output("frame").set(windowedFrame);
fft->input("frame").set(windowedFrame);
fft->output("fft").set(fftFrame);vector<vector<complex<Real>>> stftResult;while (true) {frameCutter->compute();if (frame.empty()) break;windowing->compute();fft->compute();stftResult.push_back(fftFrame);
}

🔁 实现 ISTFT

由于 Essentia 并未提供完整的 Overlap-Add 封装,我们需要仿照 librosa 逻辑手动实现。

Algorithm* ifft = factory.create("IFFT", "size", frameSize);vector<Real> window = computeLibrosaHann(frameSize);
Real windowSum = 0.0;
for (int i = 0; i < hopSize; ++i) {windowSum += window[i] * window[i];
}
const Real compensation = 1.0 / windowSum;vector<Real> reconstructedAudio(originalLength, 0.0);
vector<Real> ifftOutputFrame(frameSize);for (int i = 0; i < stftResult.size(); ++i) {ifft->input("fft").set(stftResult[i]);ifft->output("frame").set(ifftOutputFrame);ifft->compute();int pos = i * hopSize;for (int n = 0; n < frameSize && pos + n < reconstructedAudio.size(); ++n) {reconstructedAudio[pos + n] += ifftOutputFrame[n] * window[n] * compensation;}
}

🧹 后处理与音频保存

// Post-processing: removeDCOffset, then conservativeNormalize with a 0.99 target peak.
removeDCOffset(reconstructedAudio);
conservativeNormalize(reconstructedAudio, 0.99);
// Save the audio: a MonoWriter is created for "./reconstructed.wav" with sampleRate 16000,
// fed the reconstructed samples, run once and then deleted.
Algorithm* writer = factory.create("MonoWriter","filename", "./reconstructed.wav","sampleRate", 16000);
writer->input("audio").set(reconstructedAudio);
writer->compute();
delete writer;

🧪 代码完整示例

#include <algorithm>
#include <cmath>
#include <complex>
#include <exception>
#include <iostream>
#include <memory>
#include <numeric>
#include <string>
#include <vector>

#include <Eigen/Dense>
#include <unsupported/Eigen/CXX11/Tensor>

#include <essentia/essentia.h>
#include <essentia/algorithmfactory.h>
#include <essentia/pool.h>

using namespace std;
using namespace essentia;
using namespace essentia::standard;

// Hann window used by the librosa-style ISTFT
/**
 * Builds a Hann window of `size` samples: w[i] = sin^2(pi * i / (size - 1)).
 *
 * The (size - 1) denominator makes this the symmetric form of the window: the
 * first and last coefficients are (numerically) zero. main() uses it as the
 * synthesis window of the overlap-add.
 * NOTE(review): despite the name, librosa's default "hann" is the periodic
 * variant (denominator `size`); confirm which one is intended before comparing
 * results against librosa.
 *
 * @param size  Number of coefficients. Values <= 0 give an empty window and
 *              size == 1 gives {1}, which avoids the 0/0 the formula would hit.
 * @return The window coefficients.
 */
vector<Real> computeLibrosaHann(int size) {
    if (size <= 0) {
        return {};
    }
    if (size == 1) {
        return vector<Real>(1, Real(1));
    }

    vector<Real> window(size);
    for (int i = 0; i < size; ++i) {
        const double s = sin(M_PI * i / (size - 1));
        window[i] = static_cast<Real>(s * s);
    }
    return window;
}

// Remove DC offset
/**
 * Subtracts the arithmetic mean of `audio` from every sample, in place.
 *
 * @param audio  Samples to centre; modified in place.
 */
void removeDCOffset(vector<Real>& audio) {
    // Sum in double precision, then narrow the mean back to Real.
    double total = 0.0;
    for (const Real value : audio) {
        total = total + value;
    }
    const Real mean = total / audio.size();

    for (size_t idx = 0; idx < audio.size(); ++idx) {
        audio[idx] -= mean;
    }
}

// Conservative normalisation
void conservativeNormalize(vector<Real>& audio, Real targetPeak = 0.99) {Real peak = *max_element(audio.begin(), audio.end(),[](Real a, Real b) { return abs(a) < abs(b); });if (abs(peak) > 1e-6) {  // 避免除以0for (auto& sample : audio) {sample *= (targetPeak / abs(peak));}}
}int main() {// 初始化Essentiaessentia::init();AlgorithmFactory& factory = standard::AlgorithmFactory::instance();// 1. 加载音频Algorithm* loader = factory.create("MonoLoader","filename", "/work/000002.wav","sampleRate", 16000);vector<Real> audio;loader->output("audio").set(audio);loader->compute();delete loader;cout << "Loaded audio with " << audio.size() << " samples" << endl;// 2. STFT参数设置const int frameSize = 320;      // 与librosa默认值一致const int hopSize = 80;        // 25%重叠const int originalLength = audio.size();const string windowType = "hann";// 3. STFT处理Algorithm* frameCutter = factory.create("FrameCutter","frameSize", frameSize,"hopSize", hopSize);Algorithm* windowing = factory.create("Windowing","type", windowType,"normalized", false,"zeroPhase", false);Algorithm* fft = factory.create("FFT", "size", frameSize);frameCutter->input("signal").set(audio);vector<Real> frame, windowedFrame;frameCutter->output("frame").set(frame);windowing->input("frame").set(frame);windowing->output("frame").set(windowedFrame);vector<complex<Real>> fftFrame;fft->input("frame").set(windowedFrame);fft->output("fft").set(fftFrame);vector<vector<complex<Real>>> stftResult;// STFT处理循环while (true) {frameCutter->compute();if (frame.empty()) break;windowing->compute();fft->compute();stftResult.push_back(fftFrame);}delete frameCutter;delete windowing;delete fft;cout << "STFT completed. Frames: " << stftResult.size() << ", Bins: " << (stftResult.empty() ? 0 : stftResult[0].size()) << endl;// 4. 
ISTFT处理(librosa风格)Algorithm* ifft = factory.create("IFFT", "size", frameSize);// 计算窗函数和补偿因子vector<Real> window = computeLibrosaHann(frameSize);Real windowSum = 0.0;for (int i = 0; i < hopSize; ++i) {windowSum += window[i] * window[i];}const Real compensation = 1.0 / windowSum;// 重建音频初始化vector<Real> reconstructedAudio(originalLength, 0.0);vector<Real> ifftOutputFrame(frameSize);// 处理每一帧for (int i = 0; i < stftResult.size(); ++i) {// IFFT变换ifft->input("fft").set(stftResult[i]);ifft->output("frame").set(ifftOutputFrame);ifft->compute();// 重叠相加(librosa风格)int pos = i * hopSize;for (int n = 0; n < frameSize && pos + n < reconstructedAudio.size(); ++n) {reconstructedAudio[pos + n] += ifftOutputFrame[n] * window[n] * compensation;}}delete ifft;// 5. 后处理removeDCOffset(reconstructedAudio);conservativeNormalize(reconstructedAudio, 0.99);// 6. 结果验证// 计算RMS能量比auto computeRMS = [](const vector<Real>& x) {return sqrt(accumulate(x.begin(), x.end(), 0.0, [](Real sum, Real val) { return sum + val*val; }) / x.size());};Real originalRMS = computeRMS(audio);Real reconstructedRMS = computeRMS(reconstructedAudio);cout << "Volume ratio (reconstructed/original): " << reconstructedRMS / originalRMS << endl;// 7. 保存结果Algorithm* writer = factory.create("MonoWriter","filename", "./reconstructed.wav","sampleRate", 16000);writer->input("audio").set(reconstructedAudio);writer->compute();delete writer;essentia::shutdown();return 0;
}

🧩 CMake 配置示例

Essentia 依赖众多第三方库,下面是一个完整的 CMakeLists.txt 配置参考:

cmake_minimum_required(VERSION 3.10)
project(gcrn_cpp)

set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

# Project sources besides main.cpp, plus bundled headers and prebuilt libraries.
file(GLOB_RECURSE CORE_SOURCE_FILES ${CMAKE_CURRENT_LIST_DIR}/source/*.cpp)
include_directories(${CMAKE_CURRENT_LIST_DIR}/3rdparty)
link_directories(${CMAKE_CURRENT_LIST_DIR}/libs/)

# Point pkg-config at the .pc files under Essentia's packaging/debian_3rdparty directory.
set(ENV{PKG_CONFIG_PATH} "/essentia-master/packaging/debian_3rdparty/lib/pkgconfig:$ENV{PKG_CONFIG_PATH}")
find_package(PkgConfig REQUIRED)

# Locate the dependencies
pkg_check_modules(SWRESAMPLE REQUIRED libswresample)
pkg_check_modules(AVCODEC REQUIRED libavcodec)
pkg_check_modules(AVFORMAT REQUIRED libavformat)
# ${AVUTIL_LIBRARIES} is linked below, so it has to be resolved as well.
pkg_check_modules(AVUTIL REQUIRED libavutil)
pkg_check_modules(SAMPLERATE REQUIRED samplerate)
pkg_check_modules(FFTW3 REQUIRED fftw3f)
pkg_check_modules(CHROMAPRINT REQUIRED libchromaprint)
pkg_check_modules(TAGLIB REQUIRED taglib)
pkg_check_modules(YAML REQUIRED yaml-0.1)
pkg_check_modules(EIGEN3 REQUIRED eigen3)

include_directories(${EIGEN3_INCLUDE_DIRS})
link_directories(${SAMPLERATE_LIBRARY_DIRS})

add_executable(gcrn_cpp main.cpp ${CORE_SOURCE_FILES})
target_link_libraries(gcrn_cpp PRIVATE
    essentia
    ${CHROMAPRINT_LIBRARIES}
    ${SWRESAMPLE_LIBRARIES}
    ${SAMPLERATE_LIBRARIES}
    ${AVFORMAT_LIBRARIES}
    ${AVCODEC_LIBRARIES}
    ${AVUTIL_LIBRARIES}
    ${FFTW3_LIBRARIES}
    ${TAGLIB_LIBRARIES}
    ${YAML_LIBRARIES}
    pthread dl m z
)

📚 参考资料

  • 官方仓库:https://github.com/MTG/essentia

  • FAQ 页面:https://essentia.upf.edu/FAQ.html (Frequently Asked Questions — Essentia 2.1-beta6-dev documentation)

  • Librosa STFT 文档:https://librosa.org/doc/main/generated/librosa.stft.html

http://www.lryc.cn/news/599802.html

相关文章:

  • C++中new和delete的多重面孔:operator new、new operator与placement new解析
  • 机器学习-SVM支持向量机
  • Zookeeper学习专栏(十):核心流程剖析之服务启动、请求处理与选举协议
  • 【Linux】进程切换与优先级
  • Metaspace耗尽导致OOM问题
  • 【运维自动化-标准运维】各类全局变量使用说明(下)
  • 伯俊科技× OB Cloud:零售业落地AI的“三步走”渐进式发展实践
  • 企业微信H5应用OAuth2登录,企业微信授权登录
  • 国产DevOps平台Gitee:如何重塑中国企业研发效能新格局
  • 如何在 Ubuntu 24.04 或 22.04 上安装和使用 GDebi
  • Qt 反射机制与动态属性系统
  • UserWarning: Workbook contains no default style, apply openpyxl‘s default warn
  • ReAct Agent(LangGraph实现)
  • 04-netty基础-Reactor三种模型
  • 无需 Root 关闭联网验证 随意修改手机名称(适用于OPPO、一加、真我)
  • 【笔记】Handy Multi-Agent Tutorial 第四章: CAMEL框架下的RAG应用 (简介)
  • RocketMQ 5.3.0 ARM64 架构安装部署指南
  • 详解FreeRTOS开发过程(八)-- 时间标志
  • 【电赛学习笔记】MaxiCAM 项目实践——与单片机的串口通信
  • ESP32学习笔记_Components(1)——使用LED Strip组件点亮LED灯带
  • Yolov8/Yolov11实例分割训练自有数据集
  • AWS WebRTC:我们的业务模式
  • 壁纸管理 API 文档
  • MybatisPlus-17.扩展功能-JSON处理器
  • Asp.net core mvc中TagHelper的GetChildContentAsync和Content区别
  • 【04】C#入门到精通——C# 程序错误处理, try catch 捕获异常,避免程序崩溃
  • Android 的16 KB内存页设备需要硬件支持吗,还是只需要手机升级到Android15系统就可以
  • [python][基础]Flask 技术栈
  • c盘temp文件夹可以删除吗?C盘空间清理指南来了
  • epoll_event数据结构及使用案例详解