(具体后续整理完善)
Tensorflow speech_commands 训练自己的数据集
tensorflow 采用hash
# train python train.py --data_url="" \ --data_dir=data --wanted_words="words1,words2" \ # build pb python freeze.py --start_checkpoint=/home/ysr/project/rknn/model/conv.ckpt-1200 \ --output_file=/home/ysr/project/rknn/model/my_frozen_graph.pb
RKNN-tools 导出RKNN
import numpy as np
import re
import math
import random
# import cv2
from rknn.api import RKNN

if __name__ == '__main__':
    # Create the RKNN object; build details go to a log file.
    rknn = RKNN(verbose=False, verbose_file='./speech_command_build.log')

    # Configure model conversion for the target platform.
    # rknn.config(quantized_dtype='dynamic_fixed_point-8')
    # rknn.config(quantized_dtype='asymmetric_quantized-u8')
    rknn.config(target_platform=['rv1126'])

    # Load the frozen TensorFlow graph produced by freeze.py.
    print('--> Loading model')
    rknn.load_tensorflow(tf_pb='./model/my_frozen_graph.pb',
                         inputs=['Reshape'],
                         outputs=['labels_softmax'],
                         input_size_list=[[1, 3920]])  # 40 x 98
    print('done')

    # Build the model without quantization.
    print('--> Building model')
    rknn.build(do_quantization=False, dataset='./dataset.txt',
               pre_compile=False)
    print('done')

    # Export the converted RKNN model.
    # rknn.export_rknn('./speech_command_quantized.rknn')
    rknn.export_rknn('./model/speech_command.rknn')
    # import time
    # time.sleep(100)
导出rknn 测试
from rknn.api import RKNN
from tensorflow.python.ops import gen_audio_ops as contrib_audio
import tensorflow as tf
import numpy as np


def load_labels(filename):
    """Read in labels, one label per line."""
    return [line.rstrip() for line in tf.io.gfile.GFile(filename)]


# Decode the wav and compute the same MFCC fingerprint the model was
# trained on: 98 frames x 40 coefficients = 3920 floats.
with open("model/recoard.wav", "rb") as wav_file:  # FIX: close the handle
    wav_data = wav_file.read()
decoded_sample_data = contrib_audio.decode_wav(
    wav_data, desired_channels=1, desired_samples=16000,
    name='decoded_sample_data')
spectrogram = contrib_audio.audio_spectrogram(
    decoded_sample_data.audio, window_size=480, stride=160,
    magnitude_squared=True)
fingerprint_input = contrib_audio.mfcc(
    spectrogram, 16000, dct_coefficient_count=40)
print(fingerprint_input)
fingerprint_input_npy = fingerprint_input.numpy()
print(fingerprint_input_npy.size)

# Run the exported model through the RKNN runtime.
rknn = RKNN()
ret = rknn.load_rknn(path='model/speech_command.rknn')
print("rknn runtime start")
ret = rknn.init_runtime(perf_debug=True)
# FIX: inference() expects a *list* of input arrays; unpack the single
# output tensor from the returned list.
outputs, = rknn.inference(inputs=[fingerprint_input_npy],
                          data_type='float32')
print("rknn runtime stop")
rknn.release()

# Post-process: map the softmax scores onto the top-3 labels.
labels = load_labels("model/conv_labels.txt")
predictions = np.array(outputs)
print(outputs)
print(predictions)
# BUG FIX: argsort()[-3][:-1] indexed a scalar and then tried to slice
# it (TypeError). Take the last three indices and reverse them so the
# best class comes first, as in TensorFlow's label_wav.py.
top_k = predictions[0].argsort()[-3:][::-1]
print(top_k)
for node_id in top_k:
    human_string = labels[node_id]
    score = predictions[0][node_id]
    print('%s (score = %.5f)' % (human_string, score))
Tensorflow 提取MFCC 算法和 Spectrogram
不依赖tensorflow.so
NPU 调用
/* * @Author: your name * @Date: 2021-08-02 17:58:26 * @LastEditTime: 2021-08-05 13:50:22 * @LastEditors: Please set LastEditors * @Description: In User Settings Edit * @FilePath: \deploy\tfversion\demo.cc */ #include <cmath> #include <cstdint> #include <fstream> #include <iostream> #include <sstream> #include <vector> #include <iomanip> #include <chrono> #include <dirent.h> #include <cstring> //#include "wav_header.h" #include "mfcc.h" #include "spectrogram.h" #include "NanoDet.hpp" #include "knn_api.h"
extern "C"
{
//#include "rknn_inference.h"
}
static void printRKNNTensor(rknn_tensor_attr *attr);
static unsigned char *load_model(const char *filename, int *model_size);
int rknn_start(float *data, uint32_t size);
/* Steps to calculate MFCC
Step1, load wav file prepare audio data
Step2, Spectrogram sgram; sgram.Initialize(int window_length, int
step_length); in spectrogram.cc window_length=window_size=480,
step_length=stride=160(tf.audio_spectrogram); use
ComputeSquaredMagnitudeSpectrogram(input, output), get the final spectrogram
results.
Step3, then use mfcc to compute mfcc features in mfcc.cc
mfcc.Initialize(int input_length, double input_sample_rate),
input_length=input.size(),
*/
// int16 value limits, kept for the (commented-out) float->int16 path below.
static const int16_t kint16min = static_cast<int16_t>(~0x7FFF);
static const int16_t kint16max = static_cast<int16_t>(0x7FFF);
// Wall-clock aliases used to time the feature-extraction pipeline.
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::milliseconds Milliseconds;
// Map a signed 16-bit PCM sample onto the float range [-1.0, 1.0).
inline float Int16SampleToFloat(int16_t data) {
constexpr float kMultiplier = 1.0f / (1 << 15); // 1 / 2^15 = 1/32768
return data * kMultiplier;
}
//inline int16_t FloatToInt16Sample(float data) {
// constexpr float kMultiplier = 1.0f * (1 << 15);
// return std::min<float>(
// std::max<float>(roundf(data * kMultiplier), kint16min), kint16max);
//}
// Canonical 44-byte RIFF/WAVE file header (16-bit PCM layout).
// NOTE(review): the original field comments were mojibake (GBK-encoded
// Chinese read as Latin-1); rewritten in English from the field names and
// the checks performed in ReadWav below.
struct WAVHeader {
/* RIFF Chunk Descriptor */
uint8_t RIFF[4]; // "RIFF" magic; every WAV/AVI file starts with it
uint32_t ChunkSize; // little-endian: total file size minus the 8 bytes of ChunkID+ChunkSize
uint8_t WAVE[4]; // "WAVE" magic
/* "fmt" sub-chunk */
uint8_t fmt[4]; // "fmt " magic; the format chunk starts here
uint32_t Subchunk1Size; // little-endian: size of the format chunk minus 8
uint16_t AudioFormat; // 1=PCM, 6=mulaw, 7=alaw, 257=IBM Mu-Law, 258=IBM A-Law, 259=ADPCM
uint16_t NumOfChannels; // 1=Mono, 2=Stereo
uint32_t SamplesPerSec; // sampling frequency in Hz (frames per second per channel)
uint32_t bytesPerSec; // bytes per second = SampleRate * BlockAlign
uint16_t blockAlign; // bytes per frame = NumChannels * BitsPerSample / 8 (2=16-bit mono, 4=16-bit stereo)
uint16_t bitsPerSample; // bits per sample (this code only supports 16)
/* "data" sub-chunk */
uint8_t Subchunk2ID[4]; // "data" magic; the sample payload starts after this
uint32_t Subchunk2Size; // little-endian: total byte count of the sample payload
};
// Read audio data from wav file like tensorflow cc
// Load 16-bit mono 16 kHz PCM samples from a canonical WAV file,
// normalising each sample to [-1.0, 1.0) and clipping/zero-padding the
// output to exactly one second of audio (mirrors the training pipeline).
//
//   filePath               path of the wav file to read
//   data                   out: normalised samples, resized to 1 s worth
//   decoded_sample_count   out: number of sample frames in the data chunk
//   decoded_channel_count  out: channel count from the header (must be 1)
//   decoded_sample_rate    out: sample rate from the header (must be 16000)
// Returns 0 on success, 1 on any error.
size_t ReadWav(const std::string &filePath, std::vector<double> &data,
               uint32_t &decoded_sample_count, uint16_t &decoded_channel_count,
               uint32_t &decoded_sample_rate) {
  std::ifstream inFile(filePath, std::ifstream::in | std::ifstream::binary);
  // BUG FIX: the open check used to run only after the header had already
  // been read and validated from uninitialised memory.
  if (!inFile.is_open()) {
    std::cout << std::endl << "Can not open the WAV file !!" << std::endl;
    return 1;
  }
  // Read the header and sanity-check the format we support.
  WAVHeader hdr;
  inFile.read(reinterpret_cast<char *>(&hdr), sizeof(WAVHeader));
  if (!inFile) {
    std::cerr << "File too short for a WAV header" << std::endl;
    return 1;
  }
  if (hdr.AudioFormat != 1 || hdr.bitsPerSample != 16) {
    std::cerr << "Unsupported audio format, use 16 bit PCM Wave"
              << std::endl;
    return 1;
  }
  // Check sampling rate.
  decoded_sample_rate = hdr.SamplesPerSec;
  if (hdr.SamplesPerSec != 16000) {
    std::cerr << "Sampling rate mismatch: Found " << hdr.SamplesPerSec
              << " instead of " << 16000 << std::endl;
    return 1;
  }
  // Check channel count.
  decoded_channel_count = hdr.NumOfChannels;
  if (hdr.NumOfChannels != 1) {
    std::cerr << hdr.NumOfChannels
              << " channel files are unsupported. Use mono." << std::endl;
    return 1;
  }
  // Bytes per sample frame (all channels), rounded up to whole bytes.
  uint32_t expected_bytes = (hdr.bitsPerSample * hdr.NumOfChannels + 7) / 8;
  std::cout << "chunk_size: " << hdr.ChunkSize
            << "\t bytes_per_seconds: " << hdr.bytesPerSec
            << "\texpected bytes: " << expected_bytes
            << "bits_per_samples: " << hdr.bitsPerSample << std::endl;
  // BUG FIX: the sample count was derived from ChunkSize, which covers the
  // whole file (header included) and overcounted; Subchunk2Size is the
  // byte length of the actual data payload. Assumes a canonical header
  // with no extra chunks -- same assumption the struct read above makes.
  decoded_sample_count = hdr.Subchunk2Size / expected_bytes;
  uint32_t data_count = decoded_sample_count * hdr.NumOfChannels;
  std::cout << "Total samples in wav:" << data_count << std::endl;
  // BUG FIX: the buffer length was held in a uint16_t, silently truncating
  // any file with more than 65535 samples. A vector also removes the raw
  // new[]/delete[] pair.
  std::vector<int16_t> buffer(data_count);
  inFile.read(reinterpret_cast<char *>(buffer.data()),
              static_cast<std::streamsize>(data_count * sizeof(int16_t)));
  std::vector<float> float_values(data_count);
  for (uint32_t i = 0; i < data_count; ++i)
    float_values[i] = Int16SampleToFloat(buffer[i]);
  inFile.close();
  // Clip (or zero-pad) to exactly clip_duration_ms of audio, the same as
  // the training-time data pipeline.
  int clip_duration_ms = 1000;  // only the first 1000 ms is used
  int desired_samples = int(decoded_sample_rate * clip_duration_ms / 1000);
  data.resize(desired_samples);
  std::cout << "Choose process samples size was: " << desired_samples
            << std::endl;
  for (int i = 0; i < desired_samples; ++i) {
    if (static_cast<size_t>(i) >= float_values.size()) {
      data[i] = 0.0;  // zero-pad clips shorter than one second
    } else {
      data[i] = float_values[i];
    }
  }
  return 0;
}
// Convert vector of double to string (for writing MFCC file output)
// Join a vector of doubles into one delimited line terminated by "\n"
// (used when writing MFCC feature files).
// BUG FIX: the original dereferenced vec.back() unconditionally and the
// loop bound vec.size() - 1 underflowed for an empty vector -- both
// undefined behaviour. Empty input now yields just the newline.
std::string vector_to_string(std::vector<double> vec,
                             const std::string &delimiter) {
  std::stringstream vecStream;
  if (vec.empty()) {
    vecStream << "\n";
    return vecStream.str();
  }
  for (size_t i = 0; i + 1 < vec.size(); i++) {
    vecStream << vec[i];
    vecStream << delimiter;
  }
  vecStream << vec.back();
  vecStream << "\n";
  return vecStream.str();
}
// Join a matrix of doubles: each row is rendered by vector_to_string and
// every row except the last is followed by one extra blank line (this
// preserves the original output layout).
// BUG FIX: the original dereferenced vec.back() on an empty outer vector
// and its loop bound vec.size() - 1 underflowed -- undefined behaviour.
// Empty input now yields an empty string.
std::string vector_vector_string(std::vector<std::vector<double>> vec,
                                 const std::string &delimiter) {
  std::stringstream vec_stream;
  if (vec.empty()) {
    return vec_stream.str();
  }
  for (size_t i = 0; i + 1 < vec.size(); ++i) {
    vec_stream << vector_to_string(vec[i], delimiter);
    vec_stream << "\n";
  }
  vec_stream << vector_to_string(vec.back(), delimiter);
  return vec_stream.str();
}
// Convert one second of normalised audio samples into MFCC feature frames.
// Step 1 builds the squared-magnitude spectrogram, step 2 runs MFCC over
// each spectrogram slice. `mfcc` and `sgram` must already carry their
// window/filterbank configuration; frames are appended to mfcc_features.
// Returns 0 on success.
int ExtractMfccFeature(Mfcc &mfcc, Spectrogram &sgram,
                       std::vector<double> audio_samples,
                       uint32_t &sample_rate,
                       std::vector<std::vector<double>> &mfcc_features) {
  // Step 1: spectrogram. Reset() clears the internal FFT state so
  // repeated calls on the same instance start fresh.
  std::vector<std::vector<double>> spectrogram_output;
  sgram.Reset();
  sgram.ComputeSquaredMagnitudeSpectrogram(audio_samples, &spectrogram_output);
  std::cout << "spectrogram size: " << spectrogram_output.size()
            << "\tinternal vector size: " << spectrogram_output[0].size()
            << std::endl;
  // Step 2: one MFCC frame per spectrogram slice. Only 1-channel audio is
  // supported; the MFCC is (re)initialised from the slice width.
  const int num_channels = spectrogram_output[0].size();
  mfcc.Initialize(num_channels, sample_rate);
  for (size_t frame_idx = 0; frame_idx < spectrogram_output.size();
       ++frame_idx) {
    std::vector<double> frame;
    mfcc.Compute(spectrogram_output[frame_idx], &frame);
    mfcc_features.push_back(frame);
  }
  // Report the resulting feature-matrix shape.
  std::cout << "mfcc out total frames: " << mfcc_features.size()
            << " frame dimension: " << mfcc_features[0].size() << std::endl;
  return 0;  // success
}
/**
 * @description: Split `str` on every occurrence of `pattern`, appending
 * the pieces to `vec`. A trailing delimiter produces no empty final
 * element; an empty pattern is not supported.
 * @param str: input string to be split
 * @param vec: split results (appended to)
 * @param pattern: split delimiter
 * @return
 */
void SplitWord(const std::string &str, std::vector<std::string>& vec, const std::string& pattern) {
  std::string::size_type start = 0;
  for (std::string::size_type hit = str.find(pattern);
       hit != std::string::npos;
       hit = str.find(pattern, start)) {
    vec.push_back(str.substr(start, hit - start));
    start = hit + pattern.size();
  }
  // Keep whatever follows the last delimiter, if anything does.
  if (start != str.length()) {
    vec.push_back(str.substr(start));
  }
}
/**
 * Select the topNum largest probabilities from pfProb (outputCount long).
 * pfMaxProb / pMaxClass receive the scores and class indices in
 * descending order. Returns 1 on success, 0 if topNum > MAX_TOP_NUM.
 *
 * BUG FIX: the original hard-coded comparisons against pMaxClass[0..4],
 * which read uninitialised entries whenever topNum < 5 and produced
 * duplicate classes whenever topNum > 5. The "already selected" test now
 * covers exactly the j entries chosen so far. The second memset also
 * sized the uint32_t class array with sizeof(float) -- same width here,
 * but corrected for clarity.
 */
static int rknn_GetTop
(
    float *pfProb,
    float *pfMaxProb,
    uint32_t *pMaxClass,
    uint32_t outputCount,
    uint32_t topNum
)
{
    uint32_t i, j, k;
#define MAX_TOP_NUM 20
    if (topNum > MAX_TOP_NUM) return 0;
    memset(pfMaxProb, 0, sizeof(float) * topNum);
    memset(pMaxClass, 0xff, sizeof(uint32_t) * topNum);
    printf("outputCount %d topNum %d \n", outputCount, topNum);
    for (j = 0; j < topNum; j++)
    {
        for (i = 0; i < outputCount; i++)
        {
            // Skip classes already placed in earlier rounds.
            int already_selected = 0;
            for (k = 0; k < j; k++)
            {
                if (i == pMaxClass[k])
                {
                    already_selected = 1;
                    break;
                }
            }
            if (already_selected)
            {
                continue;
            }
            if (pfProb[i] > pfMaxProb[j])
            {
                pfMaxProb[j] = pfProb[i];
                pMaxClass[j] = i;
            }
        }
    }
    return 1;
}
// Extract MFCC features from one wav file, write them to `outfile` (one
// space-separated row per frame) and run the flattened feature vector
// through the RKNN model via rknn_start. Returns 0 on success, 1 on error.
int processSingleFile(Mfcc &mfcc, Spectrogram &sgram, std::string filename, std::string outfile) {
  std::cout << "Start extract audio features from file: " << filename
            << std::endl;
  Clock::time_point TStart, TEnd;
  TStart = Clock::now();
  std::vector<double> audio_samples;  // sample data normalised to -1.0~1.0
  uint32_t decoded_sample_count;      // how many samples in the wav file
  uint16_t decoded_channel_count;     // how many channels in the wav file
  uint32_t decoded_sample_rate;       // the real sample rate of the wav file
  size_t ret = ReadWav(filename, audio_samples, decoded_sample_count,
                       decoded_channel_count, decoded_sample_rate);
  if (ret != 0) {
    std::cout << "Load audio data error!\n";
    return 1;
  }
  // Get spectrogram and MFCC features.
  std::vector<std::vector<double>> mfcc_features;
  ret = ExtractMfccFeature(mfcc, sgram, audio_samples, decoded_sample_rate,
                           mfcc_features);
  // Guard: downstream code indexes mfcc_features[0].
  if (mfcc_features.empty()) {
    std::cout << "No MFCC features extracted!\n";
    return 1;
  }
  // The model input is fixed at 98 frames x 40 coefficients = 3920 floats.
  const size_t kModelInputSize = 3920;
  float data[kModelInputSize] = {0};
  size_t total = mfcc_features.size() * mfcc_features[0].size();
  // BUG FIX: %d with a size_t operand is undefined; use %zu.
  printf("mfcc_features.size() * mfcc_features[0].size() %zu \n", total);
  TEnd = Clock::now();
  Milliseconds ms = std::chrono::duration_cast<Milliseconds>(TEnd - TStart);
  std::cout << "Completed audio mfcc feature extraction cost time: "
            << ms.count() << "ms" << std::endl;
  // Save features to file while flattening them into the model buffer.
  std::ofstream outfs(outfile);
  if (!outfs.is_open()) {
    std::cout << "Open outfile " << outfile << "error!\n";
    return 1;
  }
  size_t count = 0;
  outfs << std::fixed << std::setprecision(8);
  for (size_t i = 0; i < mfcc_features.size(); ++i) {
    for (size_t j = 0; j < mfcc_features[i].size(); ++j) {
      outfs << mfcc_features[i][j] << " ";
      // BUG FIX: the original wrote past the 3920-float buffer whenever
      // the feature matrix was larger than the model input.
      if (count < kModelInputSize) {
        data[count] = mfcc_features[i][j];
      }
      count++;
    }
    outfs << std::endl;
  }
  printf("count %zu \n", count);
  rknn_start(data, total);
  outfs.close();
  return 0;
}
// Enumerate every entry of `wavFolder` (skipping "." and "..") and build
// parallel lists of input wav paths and output feature paths, the latter
// named "mfcc." + <wav name> under `outfolder`.
// NOTE(review): the per-file processing call in the final loop is
// commented out, so this currently only collects the path lists and does
// no actual work on them. Returns 0 on success, 1 if the folder cannot
// be opened.
int processFileList(Mfcc &mfcc, Spectrogram &sgram, std::string wavFolder, std::string outfolder) {
DIR *pDir;
struct dirent *ptr;
std::vector<std::string> files;
std::vector<std::string> outfiles;
if (!(pDir = opendir(wavFolder.c_str()))) {
perror(("Folder " + wavFolder + "doesn't exist!").c_str());
return 1;
}
// Collect input/output path pairs for every directory entry.
while ((ptr = readdir(pDir)) != 0) {
if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
// std::cout << ptr->d_name << std::endl;
// extract label
// std::vector<std::string> vec;
// std::string delimiter = "_";
// SplitWord(ptr->d_name, vec, delimiter);
// if (vec.size() != 2) {
// std::cout << "wav file name not contain label: " << ptr->d_name
// << std::endl;
// }
// example entry: 5338ca0367ec5ef0d43244cdae31dda7.wav_2
files.push_back(wavFolder + "/" + ptr->d_name);
// output name: mfcc.5338ca0367ec5ef0d43244cdae31dda7.wav_2
outfiles.push_back(outfolder + "/" + "mfcc." + ptr->d_name);
}
}
closedir(pDir);
// Per-file processing is currently disabled (see NOTE above).
for (int i = 0; i < files.size(); ++i) {
//processSingleFile(mfcc, sgram, files[i], outfiles[i]);
}
return 0;
}
// Feed a flattened MFCC feature vector (`size` float elements) through the
// RKNN speech-command model and print the top-5 classes per output.
// Returns 0 on success, -1 on any RKNN API failure.
int rknn_start(float *data, uint32_t size)
{
    int ret;
    rknn_context ctx;
    int model_len = 0;
    unsigned char *model;
    // BUG FIX: a string literal is const; binding it to a plain char* is
    // ill-formed C++.
    const char* model_path = "./speech_command.rknn";

    // Load the RKNN model file and create the inference context.
    model = load_model(model_path, &model_len);
    // BUG FIX: a NULL model used to be passed straight to rknn_init.
    if(model == NULL) {
        return -1;
    }
    ret = rknn_init(&ctx, model, model_len, 0);
    if(ret < 0) {
        printf("rknn_init fail! ret=%d\n", ret);
        free(model);
        return -1;
    }

    // Query and dump input/output tensor attributes (debug aid).
    rknn_input_output_num io_num;
    ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
    if (ret != RKNN_SUCC) {
        printf("rknn_query fail! ret=%d\n", ret);
        return -1;
    }
    printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);

    printf("input tensors:\n");
    rknn_tensor_attr input_attrs[io_num.n_input];
    memset(input_attrs, 0, sizeof(input_attrs));
    for (uint32_t i = 0; i < io_num.n_input; i++) {
        input_attrs[i].index = i;
        ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
        if (ret != RKNN_SUCC) {
            printf("rknn_query fail! ret=%d\n", ret);
            return -1;
        }
        printRKNNTensor(&(input_attrs[i]));
    }

    printf("output tensors:\n");
    rknn_tensor_attr output_attrs[io_num.n_output];
    memset(output_attrs, 0, sizeof(output_attrs));
    for (uint32_t i = 0; i < io_num.n_output; i++) {
        output_attrs[i].index = i;
        ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
        if (ret != RKNN_SUCC) {
            printf("rknn_query fail! ret=%d\n", ret);
            return -1;
        }
        printRKNNTensor(&(output_attrs[i]));
    }

    // Set the single float32 input.
    rknn_input inputs[1];
    memset(inputs, 0, sizeof(inputs));
    inputs[0].index = 0;
    inputs[0].type = RKNN_TENSOR_FLOAT32;
    // BUG FIX: rknn_input.size is documented as a byte count; the original
    // passed the float element count. NOTE(review): confirm against the
    // target SDK version before deploying.
    inputs[0].size = size * sizeof(float);
    inputs[0].fmt = RKNN_TENSOR_NHWC;
    inputs[0].buf = data;
    ret = rknn_inputs_set(ctx, io_num.n_input, inputs);
    if(ret < 0) {
        printf("rknn_input_set fail! ret=%d\n", ret);
        return -1;
    }

    // Run inference.
    printf("rknn_run\n");
    ret = rknn_run(ctx, nullptr);
    if(ret < 0) {
        printf("rknn_run fail! ret=%d\n", ret);
        return -1;
    }

    // Fetch exactly one output, converted to float.
    rknn_output outputs[1];
    memset(outputs, 0, sizeof(outputs));
    outputs[0].want_float = 1;
    ret = rknn_outputs_get(ctx, 1, outputs, NULL);
    if(ret < 0) {
        printf("rknn_outputs_get fail! ret=%d\n", ret);
        return -1;
    }

    // Post-process each fetched output.
    // BUG FIX: the original looped over io_num.n_output while only one
    // output had been fetched into a 1-element array -- out-of-bounds for
    // any model with more than one output tensor.
    for (int i = 0; i < 1; i++)
    {
        uint32_t MaxClass[5];
        float fMaxProb[5];
        float *buffer = (float *)outputs[i].buf;
        uint32_t sz = outputs[i].size / 4;  // byte count -> float count
        printf("outputs[%d].size %d index %d \n", i, outputs[i].size, outputs[i].index);
        rknn_GetTop(buffer, fMaxProb, MaxClass, sz, 5);
        printf(" --- Top5 ---\n");
        for(int k = 0; k < 5; k++)
        {
            printf("%3d: %8.6f\n", MaxClass[k], fMaxProb[k]);
        }
    }

    // Release outputs, context and the model buffer.
    rknn_outputs_release(ctx, 1, outputs);
    rknn_destroy(ctx);
    free(model);
    return 0;
}
// Entry point: configure the feature pipeline to match model training
// (16 kHz audio, 30 ms window, 10 ms stride, 40 mel channels, 40 DCT
// coefficients) and run one wav file through MFCC extraction + RKNN
// inference. Roughly 60 lines of dead commented-out experiments and an
// unused `spectrogram_channels` constant were removed from the original.
int main() {
  // ----------------Parameters for Spectrogram and MFCC-----------------//
  int sample_rate = 16000;    // default 16000; only 16 kHz is supported
  int window_size_ms = 30;    // default 30 ms, keep the same as training
  int window_size_samples = int(sample_rate * window_size_ms / 1000);  // 480
  int window_stride_ms = 10;  // 10 ms
  int window_stride_samples =
      int(sample_rate * window_stride_ms / 1000);  // 160

  // Spectrogram instance, shared across files.
  Spectrogram sgram;
  sgram.Initialize(window_size_samples, window_stride_samples);

  // MFCC instance. Its Initialize() is deferred until the spectrogram
  // width is known (done inside ExtractMfccFeature).
  Mfcc mfcc;
  // Defaults to 20: the lowest frequency used for the cepstrum.
  double lower_frequency_limit_ = 20;  // 20 Hz
  // Defaults to 4000: the highest frequency used for the cepstrum.
  double upper_frequency_limit_ = 4000;
  // Defaults to 40: resolution of the mel bank used internally.
  int filterbank_channel_count_ = 40;
  // How many output channels to produce per time slice (library default 13).
  int dct_coefficient_count_ = 40;
  mfcc.set_upper_frequency_limit(upper_frequency_limit_);
  mfcc.set_lower_frequency_limit(lower_frequency_limit_);
  mfcc.set_filterbank_channel_count(filterbank_channel_count_);
  mfcc.set_dct_coefficient_count(dct_coefficient_count_);

  // Process a single recording and dump its features alongside inference.
  std::string wav_file = "./recoard.wav";
  std::string mfcc_out_dir = "./feat.mfcc";
  processSingleFile(mfcc, sgram, wav_file, mfcc_out_dir);

  // To batch-process a directory instead:
  // processFileList(mfcc, sgram, "../audios/", "../features/");
  return 0;
}
// Copyright (c) 2021 by Rockchip Electronics Co., Ltd. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*-------------------------------------------
Functions
-------------------------------------------*/
// Dump one RKNN tensor attribute for debugging. dims are printed from
// index 3 down to 0, matching the original's ordering.
// BUG FIX: the fmt=%d field used to print a hard-coded 0 instead of the
// tensor's actual attr->fmt value.
static void printRKNNTensor(rknn_tensor_attr *attr) {
    printf("index=%d name=%s n_dims=%d dims=[%d %d %d %d] n_elems=%d size=%d fmt=%d type=%d qnt_type=%d fl=%d zp=%d scale=%f\n",
           attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0],
           attr->n_elems, attr->size, attr->fmt, attr->type, attr->qnt_type, attr->fl, attr->zp, attr->scale);
}
// Read a whole file into a freshly malloc'd buffer.
// On success returns the buffer and stores its length in *model_size;
// on any failure returns NULL (the caller must check before use).
static unsigned char *load_model(const char *filename, int *model_size)
{
    FILE *fp = fopen(filename, "rb");
    if(fp == nullptr) {
        printf("fopen %s fail!\n", filename);
        return NULL;
    }
    fseek(fp, 0, SEEK_END);
    int model_len = ftell(fp);
    unsigned char *model = (unsigned char*)malloc(model_len);
    // BUG FIX: a failed malloc used to be passed straight to fread.
    if(model == NULL) {
        printf("malloc %d bytes fail!\n", model_len);
        fclose(fp);
        return NULL;
    }
    fseek(fp, 0, SEEK_SET);
    // BUG FIX: the file handle leaked on a short read.
    if(model_len != (int)fread(model, 1, model_len, fp)) {
        printf("fread %s fail!\n", filename);
        free(model);
        fclose(fp);
        return NULL;
    }
    *model_size = model_len;
    fclose(fp);  // fp is always valid here; the redundant if(fp) is gone
    return model;
}