(具体后续整理完善)
Tensorflow speech_commands 训练自己的数据集
tensorflow 采用hash
# train python train.py --data_url="" \ --data_dir=data --wanted_words="words1,words2" \ # build pb python freeze.py --start_checkpoint=/home/ysr/project/rknn/model/conv.ckpt-1200 \ --output_file=/home/ysr/project/rknn/model/my_frozen_graph.pb
RKNN-tools 导出RKNN
import numpy as np
import re
import math
import random
# import cv2
from rknn.api import RKNN

if __name__ == '__main__':
    # Create the RKNN object; build details go to a log file.
    rknn = RKNN(verbose=False, verbose_file='./speech_command_build.log')

    # Configure model conversion for the target platform.
    # rknn.config(quantized_dtype='dynamic_fixed_point-8')
    # rknn.config(quantized_dtype='asymmetric_quantized-u8')
    rknn.config(target_platform=['rv1126'])

    # Load the frozen TensorFlow graph produced by freeze.py.
    print('--> Loading model')
    rknn.load_tensorflow(tf_pb='./model/my_frozen_graph.pb',
                         inputs=['Reshape'],
                         outputs=['labels_softmax'],
                         input_size_list=[[1, 3920]])  # 40 x 98
    print('done')

    # Build the model without quantization.
    print('--> Building model')
    rknn.build(do_quantization=False, dataset='./dataset.txt',
               pre_compile=False)
    print('done')

    # Export the converted RKNN model.
    # rknn.export_rknn('./speech_command_quantized.rknn')
    rknn.export_rknn('./model/speech_command.rknn')
    # import time
    # time.sleep(100)
导出rknn 测试
from rknn.api import RKNN
from tensorflow.python.ops import gen_audio_ops as contrib_audio
import tensorflow as tf
import numpy as np


def load_labels(filename):
    """Read in labels, one label per line."""
    return [line.rstrip() for line in tf.io.gfile.GFile(filename)]


# Decode the wav and compute the same MFCC fingerprint the model was
# trained on: 98 frames x 40 coefficients = 3920 floats.
with open("model/recoard.wav", "rb") as wav_file:  # FIX: close the handle
    wav_data = wav_file.read()
decoded_sample_data = contrib_audio.decode_wav(
    wav_data, desired_channels=1, desired_samples=16000,
    name='decoded_sample_data')
spectrogram = contrib_audio.audio_spectrogram(
    decoded_sample_data.audio, window_size=480, stride=160,
    magnitude_squared=True)
fingerprint_input = contrib_audio.mfcc(
    spectrogram, 16000, dct_coefficient_count=40)
print(fingerprint_input)
fingerprint_input_npy = fingerprint_input.numpy()
print(fingerprint_input_npy.size)

# Run the exported model through the RKNN runtime.
rknn = RKNN()
ret = rknn.load_rknn(path='model/speech_command.rknn')
print("rknn runtime start")
ret = rknn.init_runtime(perf_debug=True)
# FIX: inference() expects a *list* of input arrays; unpack the single
# output tensor from the returned list.
outputs, = rknn.inference(inputs=[fingerprint_input_npy],
                          data_type='float32')
print("rknn runtime stop")
rknn.release()

# Post-process: map the softmax scores onto the top-3 labels.
labels = load_labels("model/conv_labels.txt")
predictions = np.array(outputs)
print(outputs)
print(predictions)
# BUG FIX: argsort()[-3][:-1] indexed a scalar and then tried to slice
# it (TypeError). Take the last three indices and reverse them so the
# best class comes first, as in TensorFlow's label_wav.py.
top_k = predictions[0].argsort()[-3:][::-1]
print(top_k)
for node_id in top_k:
    human_string = labels[node_id]
    score = predictions[0][node_id]
    print('%s (score = %.5f)' % (human_string, score))
Tensorflow 提取MFCC 算法和 Spectrogram
不依赖tensorflow.so
NPU 调用
/* * @Author: your name * @Date: 2021-08-02 17:58:26 * @LastEditTime: 2021-08-05 13:50:22 * @LastEditors: Please set LastEditors * @Description: In User Settings Edit * @FilePath: \deploy\tfversion\demo.cc */ #include <cmath> #include <cstdint> #include <fstream> #include <iostream> #include <sstream> #include <vector> #include <iomanip> #include <chrono> #include <dirent.h> #include <cstring> //#include "wav_header.h" #include "mfcc.h" #include "spectrogram.h" #include "NanoDet.hpp" #include "knn_api.h"
extern "C"
{
//#include "rknn_inference.h"
}
static void printRKNNTensor(rknn_tensor_attr *attr);
static unsigned char *load_model(const char *filename, int *model_size);
int rknn_start(float *data, uint32_t size);
/* Steps to calculate MFCC
Step1, load wav file prepare audio data
Step2, Spectrogram sgram; sgram.Initialize(int window_length, int
step_length); in spectrogram.cc window_length=window_size=480,
step_length=stride=160(tf.audio_spectrogram); use
ComputeSquaredMagnitudeSpectrogram(input, output), get the final spectrogram
results.
Step3, then use mfcc to compute mfcc features in mfcc.cc
mfcc.Initialize(int input_length, double input_sample_rate),
input_length=input.size(),
*/
// int16 value limits, kept for the (commented-out) float->int16 path below.
static const int16_t kint16min = static_cast<int16_t>(~0x7FFF);
static const int16_t kint16max = static_cast<int16_t>(0x7FFF);
// Wall-clock aliases used to time the feature-extraction pipeline.
typedef std::chrono::high_resolution_clock Clock;
typedef std::chrono::milliseconds Milliseconds;
// Map a signed 16-bit PCM sample onto the float range [-1.0, 1.0).
inline float Int16SampleToFloat(int16_t data) {
constexpr float kMultiplier = 1.0f / (1 << 15); // 1 / 2^15 = 1/32768
return data * kMultiplier;
}
//inline int16_t FloatToInt16Sample(float data) {
// constexpr float kMultiplier = 1.0f * (1 << 15);
// return std::min<float>(
// std::max<float>(roundf(data * kMultiplier), kint16min), kint16max);
//}
// Canonical 44-byte RIFF/WAVE file header (16-bit PCM layout).
// NOTE(review): the original field comments were mojibake (GBK-encoded
// Chinese read as Latin-1); rewritten in English from the field names and
// the checks performed in ReadWav below.
struct WAVHeader {
/* RIFF Chunk Descriptor */
uint8_t RIFF[4]; // "RIFF" magic; every WAV/AVI file starts with it
uint32_t ChunkSize; // little-endian: total file size minus the 8 bytes of ChunkID+ChunkSize
uint8_t WAVE[4]; // "WAVE" magic
/* "fmt" sub-chunk */
uint8_t fmt[4]; // "fmt " magic; the format chunk starts here
uint32_t Subchunk1Size; // little-endian: size of the format chunk minus 8
uint16_t AudioFormat; // 1=PCM, 6=mulaw, 7=alaw, 257=IBM Mu-Law, 258=IBM A-Law, 259=ADPCM
uint16_t NumOfChannels; // 1=Mono, 2=Stereo
uint32_t SamplesPerSec; // sampling frequency in Hz (frames per second per channel)
uint32_t bytesPerSec; // bytes per second = SampleRate * BlockAlign
uint16_t blockAlign; // bytes per frame = NumChannels * BitsPerSample / 8 (2=16-bit mono, 4=16-bit stereo)
uint16_t bitsPerSample; // bits per sample (this code only supports 16)
/* "data" sub-chunk */
uint8_t Subchunk2ID[4]; // "data" magic; the sample payload starts after this
uint32_t Subchunk2Size; // little-endian: total byte count of the sample payload
};
// Read audio data from wav file like tensorflow cc
// Load 16-bit mono 16 kHz PCM samples from a canonical WAV file,
// normalising each sample to [-1.0, 1.0) and clipping/zero-padding the
// output to exactly one second of audio (mirrors the training pipeline).
//
//   filePath               path of the wav file to read
//   data                   out: normalised samples, resized to 1 s worth
//   decoded_sample_count   out: number of sample frames in the data chunk
//   decoded_channel_count  out: channel count from the header (must be 1)
//   decoded_sample_rate    out: sample rate from the header (must be 16000)
// Returns 0 on success, 1 on any error.
size_t ReadWav(const std::string &filePath, std::vector<double> &data,
               uint32_t &decoded_sample_count, uint16_t &decoded_channel_count,
               uint32_t &decoded_sample_rate) {
  std::ifstream inFile(filePath, std::ifstream::in | std::ifstream::binary);
  // BUG FIX: the open check used to run only after the header had already
  // been read and validated from uninitialised memory.
  if (!inFile.is_open()) {
    std::cout << std::endl << "Can not open the WAV file !!" << std::endl;
    return 1;
  }
  // Read the header and sanity-check the format we support.
  WAVHeader hdr;
  inFile.read(reinterpret_cast<char *>(&hdr), sizeof(WAVHeader));
  if (!inFile) {
    std::cerr << "File too short for a WAV header" << std::endl;
    return 1;
  }
  if (hdr.AudioFormat != 1 || hdr.bitsPerSample != 16) {
    std::cerr << "Unsupported audio format, use 16 bit PCM Wave"
              << std::endl;
    return 1;
  }
  // Check sampling rate.
  decoded_sample_rate = hdr.SamplesPerSec;
  if (hdr.SamplesPerSec != 16000) {
    std::cerr << "Sampling rate mismatch: Found " << hdr.SamplesPerSec
              << " instead of " << 16000 << std::endl;
    return 1;
  }
  // Check channel count.
  decoded_channel_count = hdr.NumOfChannels;
  if (hdr.NumOfChannels != 1) {
    std::cerr << hdr.NumOfChannels
              << " channel files are unsupported. Use mono." << std::endl;
    return 1;
  }
  // Bytes per sample frame (all channels), rounded up to whole bytes.
  uint32_t expected_bytes = (hdr.bitsPerSample * hdr.NumOfChannels + 7) / 8;
  std::cout << "chunk_size: " << hdr.ChunkSize
            << "\t bytes_per_seconds: " << hdr.bytesPerSec
            << "\texpected bytes: " << expected_bytes
            << "bits_per_samples: " << hdr.bitsPerSample << std::endl;
  // BUG FIX: the sample count was derived from ChunkSize, which covers the
  // whole file (header included) and overcounted; Subchunk2Size is the
  // byte length of the actual data payload. Assumes a canonical header
  // with no extra chunks -- same assumption the struct read above makes.
  decoded_sample_count = hdr.Subchunk2Size / expected_bytes;
  uint32_t data_count = decoded_sample_count * hdr.NumOfChannels;
  std::cout << "Total samples in wav:" << data_count << std::endl;
  // BUG FIX: the buffer length was held in a uint16_t, silently truncating
  // any file with more than 65535 samples. A vector also removes the raw
  // new[]/delete[] pair.
  std::vector<int16_t> buffer(data_count);
  inFile.read(reinterpret_cast<char *>(buffer.data()),
              static_cast<std::streamsize>(data_count * sizeof(int16_t)));
  std::vector<float> float_values(data_count);
  for (uint32_t i = 0; i < data_count; ++i)
    float_values[i] = Int16SampleToFloat(buffer[i]);
  inFile.close();
  // Clip (or zero-pad) to exactly clip_duration_ms of audio, the same as
  // the training-time data pipeline.
  int clip_duration_ms = 1000;  // only the first 1000 ms is used
  int desired_samples = int(decoded_sample_rate * clip_duration_ms / 1000);
  data.resize(desired_samples);
  std::cout << "Choose process samples size was: " << desired_samples
            << std::endl;
  for (int i = 0; i < desired_samples; ++i) {
    if (static_cast<size_t>(i) >= float_values.size()) {
      data[i] = 0.0;  // zero-pad clips shorter than one second
    } else {
      data[i] = float_values[i];
    }
  }
  return 0;
}
// Convert vector of double to string (for writing MFCC file output)
// Join a vector of doubles into one delimited line terminated by "\n"
// (used when writing MFCC feature files).
// BUG FIX: the original dereferenced vec.back() unconditionally and the
// loop bound vec.size() - 1 underflowed for an empty vector -- both
// undefined behaviour. Empty input now yields just the newline.
std::string vector_to_string(std::vector<double> vec,
                             const std::string &delimiter) {
  std::stringstream vecStream;
  if (vec.empty()) {
    vecStream << "\n";
    return vecStream.str();
  }
  for (size_t i = 0; i + 1 < vec.size(); i++) {
    vecStream << vec[i];
    vecStream << delimiter;
  }
  vecStream << vec.back();
  vecStream << "\n";
  return vecStream.str();
}
// Join a matrix of doubles: each row is rendered by vector_to_string and
// every row except the last is followed by one extra blank line (this
// preserves the original output layout).
// BUG FIX: the original dereferenced vec.back() on an empty outer vector
// and its loop bound vec.size() - 1 underflowed -- undefined behaviour.
// Empty input now yields an empty string.
std::string vector_vector_string(std::vector<std::vector<double>> vec,
                                 const std::string &delimiter) {
  std::stringstream vec_stream;
  if (vec.empty()) {
    return vec_stream.str();
  }
  for (size_t i = 0; i + 1 < vec.size(); ++i) {
    vec_stream << vector_to_string(vec[i], delimiter);
    vec_stream << "\n";
  }
  vec_stream << vector_to_string(vec.back(), delimiter);
  return vec_stream.str();
}
// Convert one second of normalised audio samples into MFCC feature frames.
// Step 1 builds the squared-magnitude spectrogram, step 2 runs MFCC over
// each spectrogram slice. `mfcc` and `sgram` must already carry their
// window/filterbank configuration; frames are appended to mfcc_features.
// Returns 0 on success.
int ExtractMfccFeature(Mfcc &mfcc, Spectrogram &sgram,
                       std::vector<double> audio_samples,
                       uint32_t &sample_rate,
                       std::vector<std::vector<double>> &mfcc_features) {
  // Step 1: spectrogram. Reset() clears the internal FFT state so
  // repeated calls on the same instance start fresh.
  std::vector<std::vector<double>> spectrogram_output;
  sgram.Reset();
  sgram.ComputeSquaredMagnitudeSpectrogram(audio_samples, &spectrogram_output);
  std::cout << "spectrogram size: " << spectrogram_output.size()
            << "\tinternal vector size: " << spectrogram_output[0].size()
            << std::endl;
  // Step 2: one MFCC frame per spectrogram slice. Only 1-channel audio is
  // supported; the MFCC is (re)initialised from the slice width.
  const int num_channels = spectrogram_output[0].size();
  mfcc.Initialize(num_channels, sample_rate);
  for (size_t frame_idx = 0; frame_idx < spectrogram_output.size();
       ++frame_idx) {
    std::vector<double> frame;
    mfcc.Compute(spectrogram_output[frame_idx], &frame);
    mfcc_features.push_back(frame);
  }
  // Report the resulting feature-matrix shape.
  std::cout << "mfcc out total frames: " << mfcc_features.size()
            << " frame dimension: " << mfcc_features[0].size() << std::endl;
  return 0;  // success
}
/**
 * @description: Split `str` on every occurrence of `pattern`, appending
 * the pieces to `vec`. A trailing delimiter produces no empty final
 * element; an empty pattern is not supported.
 * @param str: input string to be split
 * @param vec: split results (appended to)
 * @param pattern: split delimiter
 * @return
 */
void SplitWord(const std::string &str, std::vector<std::string>& vec, const std::string& pattern) {
  std::string::size_type start = 0;
  for (std::string::size_type hit = str.find(pattern);
       hit != std::string::npos;
       hit = str.find(pattern, start)) {
    vec.push_back(str.substr(start, hit - start));
    start = hit + pattern.size();
  }
  // Keep whatever follows the last delimiter, if anything does.
  if (start != str.length()) {
    vec.push_back(str.substr(start));
  }
}
/**
 * Select the topNum largest probabilities from pfProb (outputCount long).
 * pfMaxProb / pMaxClass receive the scores and class indices in
 * descending order. Returns 1 on success, 0 if topNum > MAX_TOP_NUM.
 *
 * BUG FIX: the original hard-coded comparisons against pMaxClass[0..4],
 * which read uninitialised entries whenever topNum < 5 and produced
 * duplicate classes whenever topNum > 5. The "already selected" test now
 * covers exactly the j entries chosen so far. The second memset also
 * sized the uint32_t class array with sizeof(float) -- same width here,
 * but corrected for clarity.
 */
static int rknn_GetTop
(
    float *pfProb,
    float *pfMaxProb,
    uint32_t *pMaxClass,
    uint32_t outputCount,
    uint32_t topNum
)
{
    uint32_t i, j, k;
#define MAX_TOP_NUM 20
    if (topNum > MAX_TOP_NUM) return 0;
    memset(pfMaxProb, 0, sizeof(float) * topNum);
    memset(pMaxClass, 0xff, sizeof(uint32_t) * topNum);
    printf("outputCount %d topNum %d \n", outputCount, topNum);
    for (j = 0; j < topNum; j++)
    {
        for (i = 0; i < outputCount; i++)
        {
            // Skip classes already placed in earlier rounds.
            int already_selected = 0;
            for (k = 0; k < j; k++)
            {
                if (i == pMaxClass[k])
                {
                    already_selected = 1;
                    break;
                }
            }
            if (already_selected)
            {
                continue;
            }
            if (pfProb[i] > pfMaxProb[j])
            {
                pfMaxProb[j] = pfProb[i];
                pMaxClass[j] = i;
            }
        }
    }
    return 1;
}
// Extract MFCC features from one wav file, write them to `outfile` (one
// space-separated row per frame) and run the flattened feature vector
// through the RKNN model via rknn_start. Returns 0 on success, 1 on error.
int processSingleFile(Mfcc &mfcc, Spectrogram &sgram, std::string filename, std::string outfile) {
  std::cout << "Start extract audio features from file: " << filename
            << std::endl;
  Clock::time_point TStart, TEnd;
  TStart = Clock::now();
  std::vector<double> audio_samples;  // sample data normalised to -1.0~1.0
  uint32_t decoded_sample_count;      // how many samples in the wav file
  uint16_t decoded_channel_count;     // how many channels in the wav file
  uint32_t decoded_sample_rate;       // the real sample rate of the wav file
  size_t ret = ReadWav(filename, audio_samples, decoded_sample_count,
                       decoded_channel_count, decoded_sample_rate);
  if (ret != 0) {
    std::cout << "Load audio data error!\n";
    return 1;
  }
  // Get spectrogram and MFCC features.
  std::vector<std::vector<double>> mfcc_features;
  ret = ExtractMfccFeature(mfcc, sgram, audio_samples, decoded_sample_rate,
                           mfcc_features);
  // Guard: downstream code indexes mfcc_features[0].
  if (mfcc_features.empty()) {
    std::cout << "No MFCC features extracted!\n";
    return 1;
  }
  // The model input is fixed at 98 frames x 40 coefficients = 3920 floats.
  const size_t kModelInputSize = 3920;
  float data[kModelInputSize] = {0};
  size_t total = mfcc_features.size() * mfcc_features[0].size();
  // BUG FIX: %d with a size_t operand is undefined; use %zu.
  printf("mfcc_features.size() * mfcc_features[0].size() %zu \n", total);
  TEnd = Clock::now();
  Milliseconds ms = std::chrono::duration_cast<Milliseconds>(TEnd - TStart);
  std::cout << "Completed audio mfcc feature extraction cost time: "
            << ms.count() << "ms" << std::endl;
  // Save features to file while flattening them into the model buffer.
  std::ofstream outfs(outfile);
  if (!outfs.is_open()) {
    std::cout << "Open outfile " << outfile << "error!\n";
    return 1;
  }
  size_t count = 0;
  outfs << std::fixed << std::setprecision(8);
  for (size_t i = 0; i < mfcc_features.size(); ++i) {
    for (size_t j = 0; j < mfcc_features[i].size(); ++j) {
      outfs << mfcc_features[i][j] << " ";
      // BUG FIX: the original wrote past the 3920-float buffer whenever
      // the feature matrix was larger than the model input.
      if (count < kModelInputSize) {
        data[count] = mfcc_features[i][j];
      }
      count++;
    }
    outfs << std::endl;
  }
  printf("count %zu \n", count);
  rknn_start(data, total);
  outfs.close();
  return 0;
}
// Enumerate every entry of `wavFolder` (skipping "." and "..") and build
// parallel lists of input wav paths and output feature paths, the latter
// named "mfcc." + <wav name> under `outfolder`.
// NOTE(review): the per-file processing call in the final loop is
// commented out, so this currently only collects the path lists and does
// no actual work on them. Returns 0 on success, 1 if the folder cannot
// be opened.
int processFileList(Mfcc &mfcc, Spectrogram &sgram, std::string wavFolder, std::string outfolder) {
DIR *pDir;
struct dirent *ptr;
std::vector<std::string> files;
std::vector<std::string> outfiles;
if (!(pDir = opendir(wavFolder.c_str()))) {
perror(("Folder " + wavFolder + "doesn't exist!").c_str());
return 1;
}
// Collect input/output path pairs for every directory entry.
while ((ptr = readdir(pDir)) != 0) {
if (strcmp(ptr->d_name, ".") != 0 && strcmp(ptr->d_name, "..") != 0) {
// std::cout << ptr->d_name << std::endl;
// extract label
// std::vector<std::string> vec;
// std::string delimiter = "_";
// SplitWord(ptr->d_name, vec, delimiter);
// if (vec.size() != 2) {
// std::cout << "wav file name not contain label: " << ptr->d_name
// << std::endl;
// }
// example entry: 5338ca0367ec5ef0d43244cdae31dda7.wav_2
files.push_back(wavFolder + "/" + ptr->d_name);
// output name: mfcc.5338ca0367ec5ef0d43244cdae31dda7.wav_2
outfiles.push_back(outfolder + "/" + "mfcc." + ptr->d_name);
}
}
closedir(pDir);
// Per-file processing is currently disabled (see NOTE above).
for (int i = 0; i < files.size(); ++i) {
//processSingleFile(mfcc, sgram, files[i], outfiles[i]);
}
return 0;
}
// Feed a flattened MFCC feature vector (`size` float elements) through the
// RKNN speech-command model and print the top-5 classes per output.
// Returns 0 on success, -1 on any RKNN API failure.
int rknn_start(float *data, uint32_t size)
{
    int ret;
    rknn_context ctx;
    int model_len = 0;
    unsigned char *model;
    // BUG FIX: a string literal is const; binding it to a plain char* is
    // ill-formed C++.
    const char* model_path = "./speech_command.rknn";

    // Load the RKNN model file and create the inference context.
    model = load_model(model_path, &model_len);
    // BUG FIX: a NULL model used to be passed straight to rknn_init.
    if(model == NULL) {
        return -1;
    }
    ret = rknn_init(&ctx, model, model_len, 0);
    if(ret < 0) {
        printf("rknn_init fail! ret=%d\n", ret);
        free(model);
        return -1;
    }

    // Query and dump input/output tensor attributes (debug aid).
    rknn_input_output_num io_num;
    ret = rknn_query(ctx, RKNN_QUERY_IN_OUT_NUM, &io_num, sizeof(io_num));
    if (ret != RKNN_SUCC) {
        printf("rknn_query fail! ret=%d\n", ret);
        return -1;
    }
    printf("model input num: %d, output num: %d\n", io_num.n_input, io_num.n_output);

    printf("input tensors:\n");
    rknn_tensor_attr input_attrs[io_num.n_input];
    memset(input_attrs, 0, sizeof(input_attrs));
    for (uint32_t i = 0; i < io_num.n_input; i++) {
        input_attrs[i].index = i;
        ret = rknn_query(ctx, RKNN_QUERY_INPUT_ATTR, &(input_attrs[i]), sizeof(rknn_tensor_attr));
        if (ret != RKNN_SUCC) {
            printf("rknn_query fail! ret=%d\n", ret);
            return -1;
        }
        printRKNNTensor(&(input_attrs[i]));
    }

    printf("output tensors:\n");
    rknn_tensor_attr output_attrs[io_num.n_output];
    memset(output_attrs, 0, sizeof(output_attrs));
    for (uint32_t i = 0; i < io_num.n_output; i++) {
        output_attrs[i].index = i;
        ret = rknn_query(ctx, RKNN_QUERY_OUTPUT_ATTR, &(output_attrs[i]), sizeof(rknn_tensor_attr));
        if (ret != RKNN_SUCC) {
            printf("rknn_query fail! ret=%d\n", ret);
            return -1;
        }
        printRKNNTensor(&(output_attrs[i]));
    }

    // Set the single float32 input.
    rknn_input inputs[1];
    memset(inputs, 0, sizeof(inputs));
    inputs[0].index = 0;
    inputs[0].type = RKNN_TENSOR_FLOAT32;
    // BUG FIX: rknn_input.size is documented as a byte count; the original
    // passed the float element count. NOTE(review): confirm against the
    // target SDK version before deploying.
    inputs[0].size = size * sizeof(float);
    inputs[0].fmt = RKNN_TENSOR_NHWC;
    inputs[0].buf = data;
    ret = rknn_inputs_set(ctx, io_num.n_input, inputs);
    if(ret < 0) {
        printf("rknn_input_set fail! ret=%d\n", ret);
        return -1;
    }

    // Run inference.
    printf("rknn_run\n");
    ret = rknn_run(ctx, nullptr);
    if(ret < 0) {
        printf("rknn_run fail! ret=%d\n", ret);
        return -1;
    }

    // Fetch exactly one output, converted to float.
    rknn_output outputs[1];
    memset(outputs, 0, sizeof(outputs));
    outputs[0].want_float = 1;
    ret = rknn_outputs_get(ctx, 1, outputs, NULL);
    if(ret < 0) {
        printf("rknn_outputs_get fail! ret=%d\n", ret);
        return -1;
    }

    // Post-process each fetched output.
    // BUG FIX: the original looped over io_num.n_output while only one
    // output had been fetched into a 1-element array -- out-of-bounds for
    // any model with more than one output tensor.
    for (int i = 0; i < 1; i++)
    {
        uint32_t MaxClass[5];
        float fMaxProb[5];
        float *buffer = (float *)outputs[i].buf;
        uint32_t sz = outputs[i].size / 4;  // byte count -> float count
        printf("outputs[%d].size %d index %d \n", i, outputs[i].size, outputs[i].index);
        rknn_GetTop(buffer, fMaxProb, MaxClass, sz, 5);
        printf(" --- Top5 ---\n");
        for(int k = 0; k < 5; k++)
        {
            printf("%3d: %8.6f\n", MaxClass[k], fMaxProb[k]);
        }
    }

    // Release outputs, context and the model buffer.
    rknn_outputs_release(ctx, 1, outputs);
    rknn_destroy(ctx);
    free(model);
    return 0;
}
// Entry point: configure the feature pipeline to match model training
// (16 kHz audio, 30 ms window, 10 ms stride, 40 mel channels, 40 DCT
// coefficients) and run one wav file through MFCC extraction + RKNN
// inference. Roughly 60 lines of dead commented-out experiments and an
// unused `spectrogram_channels` constant were removed from the original.
int main() {
  // ----------------Parameters for Spectrogram and MFCC-----------------//
  int sample_rate = 16000;    // default 16000; only 16 kHz is supported
  int window_size_ms = 30;    // default 30 ms, keep the same as training
  int window_size_samples = int(sample_rate * window_size_ms / 1000);  // 480
  int window_stride_ms = 10;  // 10 ms
  int window_stride_samples =
      int(sample_rate * window_stride_ms / 1000);  // 160

  // Spectrogram instance, shared across files.
  Spectrogram sgram;
  sgram.Initialize(window_size_samples, window_stride_samples);

  // MFCC instance. Its Initialize() is deferred until the spectrogram
  // width is known (done inside ExtractMfccFeature).
  Mfcc mfcc;
  // Defaults to 20: the lowest frequency used for the cepstrum.
  double lower_frequency_limit_ = 20;  // 20 Hz
  // Defaults to 4000: the highest frequency used for the cepstrum.
  double upper_frequency_limit_ = 4000;
  // Defaults to 40: resolution of the mel bank used internally.
  int filterbank_channel_count_ = 40;
  // How many output channels to produce per time slice (library default 13).
  int dct_coefficient_count_ = 40;
  mfcc.set_upper_frequency_limit(upper_frequency_limit_);
  mfcc.set_lower_frequency_limit(lower_frequency_limit_);
  mfcc.set_filterbank_channel_count(filterbank_channel_count_);
  mfcc.set_dct_coefficient_count(dct_coefficient_count_);

  // Process a single recording and dump its features alongside inference.
  std::string wav_file = "./recoard.wav";
  std::string mfcc_out_dir = "./feat.mfcc";
  processSingleFile(mfcc, sgram, wav_file, mfcc_out_dir);

  // To batch-process a directory instead:
  // processFileList(mfcc, sgram, "../audios/", "../features/");
  return 0;
}
// Copyright (c) 2021 by Rockchip Electronics Co., Ltd. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/*-------------------------------------------
Functions
-------------------------------------------*/
// Dump one RKNN tensor attribute for debugging. dims are printed from
// index 3 down to 0, matching the original's ordering.
// BUG FIX: the fmt=%d field used to print a hard-coded 0 instead of the
// tensor's actual attr->fmt value.
static void printRKNNTensor(rknn_tensor_attr *attr) {
    printf("index=%d name=%s n_dims=%d dims=[%d %d %d %d] n_elems=%d size=%d fmt=%d type=%d qnt_type=%d fl=%d zp=%d scale=%f\n",
           attr->index, attr->name, attr->n_dims, attr->dims[3], attr->dims[2], attr->dims[1], attr->dims[0],
           attr->n_elems, attr->size, attr->fmt, attr->type, attr->qnt_type, attr->fl, attr->zp, attr->scale);
}
// Read a whole file into a freshly malloc'd buffer.
// On success returns the buffer and stores its length in *model_size;
// on any failure returns NULL (the caller must check before use).
static unsigned char *load_model(const char *filename, int *model_size)
{
    FILE *fp = fopen(filename, "rb");
    if(fp == nullptr) {
        printf("fopen %s fail!\n", filename);
        return NULL;
    }
    fseek(fp, 0, SEEK_END);
    int model_len = ftell(fp);
    unsigned char *model = (unsigned char*)malloc(model_len);
    // BUG FIX: a failed malloc used to be passed straight to fread.
    if(model == NULL) {
        printf("malloc %d bytes fail!\n", model_len);
        fclose(fp);
        return NULL;
    }
    fseek(fp, 0, SEEK_SET);
    // BUG FIX: the file handle leaked on a short read.
    if(model_len != (int)fread(model, 1, model_len, fp)) {
        printf("fread %s fail!\n", filename);
        free(model);
        fclose(fp);
        return NULL;
    }
    *model_size = model_len;
    fclose(fp);  // fp is always valid here; the redundant if(fp) is gone
    return model;
}