Background
On Windows, WebRTC is used for capture and the live stream is pushed out over RTMP. By default, WebRTC only captures the microphone; it does not capture the sound the machine itself is playing back. Capturing that background audio and mixing it into the stream therefore has to be implemented in code.
Approach
The Windows APIs (WASAPI) provide methods for audio capture, and Microsoft even publishes a short explanation with sample code, although the sample does not run as-is :). So the plan is to capture PCM audio through the Windows API (loopback capture of the default render endpoint) and then combine the streams with WebRTC's conference mixing mechanism.
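The key trick is WASAPI loopback capture: the default render endpoint is opened as if it were a capture device by passing AUDCLNT_STREAMFLAGS_LOOPBACK to IAudioClient::Initialize. Condensed from the full capture thread shown below (error handling and format fix-up omitted), the setup sequence looks roughly like this:

// Minimal WASAPI loopback setup, condensed from the capture thread below.
IMMDeviceEnumerator* enumerator = NULL;
IMMDevice* device = NULL;
IAudioClient* audioClient = NULL;
IAudioCaptureClient* captureClient = NULL;
WAVEFORMATEX* format = NULL;

CoCreateInstance(CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL,
                 IID_IMMDeviceEnumerator, (void**)&enumerator);
// eRender (not eCapture): we want what the machine is playing back.
enumerator->GetDefaultAudioEndpoint(eRender, eConsole, &device);
device->Activate(IID_IAudioClient, CLSCTX_ALL, NULL, (void**)&audioClient);
audioClient->GetMixFormat(&format);
// AUDCLNT_STREAMFLAGS_LOOPBACK turns the render endpoint into a capture source.
audioClient->Initialize(AUDCLNT_SHAREMODE_SHARED, AUDCLNT_STREAMFLAGS_LOOPBACK,
                        10000000 /* 1 s buffer, in 100-ns units */, 0, format, NULL);
audioClient->GetService(__uuidof(IAudioCaptureClient), (void**)&captureClient);
audioClient->Start();
// ...then poll captureClient->GetBuffer()/ReleaseBuffer() for PCM packets.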
Core code
Audio capture
DWORD AudioCaptureCore::DoCaptureThread() {
  keepRecording_ = true;

  HANDLE waitArray[2] = { _hShutdownCaptureEvent, _hCaptureSamplesReadyEvent };
  HRESULT hr = S_OK;

  LARGE_INTEGER t1;
  LARGE_INTEGER t2;
  int32_t time(0);

  BYTE* syncBuffer = NULL;
  UINT32 syncBufIndex = 0;

  _readSamples = 0;

  // Initialize COM as MTA in this thread.
  ScopedCOMInitializer comInit(ScopedCOMInitializer::kMTA);
  if (!comInit.succeeded()) {
    WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id,
                 "failed to initialize COM in capture thread");
    return 1;
  }

  hr = InitCaptureThreadPriority();
  if (FAILED(hr)) {
    return hr;
  }

  _Lock();

  REFERENCE_TIME hnsRequestedDuration = REFTIMES_PER_SEC;
  REFERENCE_TIME hnsActualDuration;
  UINT32 bufferLength;
  UINT32 numFramesAvailable;
  IMMDeviceEnumerator* pEnumerator = NULL;
  IMMDevice* pDevice = NULL;
  WAVEFORMATEX* pwfx = NULL;
  UINT32 packetLength = 0;
  BOOL bDone = FALSE;
  BYTE* pData;
  DWORD flags;

  hr = CoCreateInstance(CLSID_MMDeviceEnumerator, NULL, CLSCTX_ALL,
                        IID_IMMDeviceEnumerator, (void**)&pEnumerator);
  EXIT_ON_ERROR(hr);

  // Use the default *render* endpoint so the loopback stream captures what is
  // being played out (the machine's background sound).
  hr = pEnumerator->GetDefaultAudioEndpoint(eRender, eConsole, &pDevice);
  EXIT_ON_ERROR(hr);

  hr = pDevice->Activate(IID_IAudioClient, CLSCTX_ALL, NULL,
                         (void**)&_ptrAudioClientIn);
  EXIT_ON_ERROR(hr);

  // Query the engine's mix format, then override it with fixed 16-bit stereo PCM.
  hr = _ptrAudioClientIn->GetMixFormat(&pwfx);
  EXIT_ON_ERROR(hr);

  WAVEFORMATEX waveFormat;
  waveFormat.wFormatTag = WAVE_FORMAT_PCM;
  waveFormat.nChannels = 2;
  waveFormat.nSamplesPerSec = pwfx->nSamplesPerSec;
  waveFormat.nAvgBytesPerSec = pwfx->nSamplesPerSec * 4;
  waveFormat.wBitsPerSample = 16;
  waveFormat.nBlockAlign = 4;
  waveFormat.cbSize = 0;
  *pwfx = waveFormat;

  // AUDCLNT_STREAMFLAGS_LOOPBACK turns the render endpoint into a capture source.
  hr = _ptrAudioClientIn->Initialize(AUDCLNT_SHAREMODE_SHARED,
                                     AUDCLNT_STREAMFLAGS_LOOPBACK,
                                     hnsRequestedDuration,
                                     0,
                                     pwfx,
                                     NULL);
  EXIT_ON_ERROR(hr);

  // Set the VoE format equal to the AEC output format.
  _recAudioFrameSize = pwfx->nBlockAlign;
  _recSampleRate = pwfx->nSamplesPerSec;
  _recBlockSize = pwfx->nSamplesPerSec / 100;
  _recChannels = pwfx->nChannels;

  if (_ptrAudioBuffer) {
    // Update the audio buffer with the selected parameters.
    _ptrAudioBuffer->SetRecordingSampleRate(_recSampleRate);
    _ptrAudioBuffer->SetRecordingChannels((uint8_t)_recChannels);
  } else {
    // We can enter this state during CoreAudioIsSupported() when no
    // AudioDeviceImplementation has been created, hence the AudioDeviceBuffer
    // does not exist. It is OK to end up here since we don't initiate any
    // media in CoreAudioIsSupported().
    WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
                 "AudioDeviceBuffer must be attached before streaming can start");
  }

  // Get the size of the allocated buffer.
  hr = _ptrAudioClientIn->GetBufferSize(&bufferLength);
  EXIT_ON_ERROR(hr);

  hr = _ptrAudioClientIn->GetService(__uuidof(IAudioCaptureClient),
                                     (void**)&_ptrCaptureClient);
  EXIT_ON_ERROR(hr);

  // Notify the audio sink which format to use.
  // As the comment above says, this would hand the captured format to a
  // separate (user-defined) sink class. Since the format is specified manually
  // here, the notification is not needed.
  // hr = pMySink->SetFormat(pwfx);
  // EXIT_ON_ERROR(hr)

  // Calculate the actual duration of the allocated buffer.
  hnsActualDuration = (double)REFTIMES_PER_SEC * bufferLength / pwfx->nSamplesPerSec;

  // hr = _ptrAudioClientIn->Start();  // Start recording.
  // EXIT_ON_ERROR(hr);

  // Get size of capturing buffer (length is expressed as the number of audio
  // frames the buffer can hold). This value is fixed during the capturing session.
  if (_ptrAudioClientIn == NULL) {
    WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id,
                 "input state has been modified before capture loop starts.");
    return 1;
  }
  hr = _ptrAudioClientIn->GetBufferSize(&bufferLength);
  EXIT_ON_ERROR(hr);
  WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
               "[CAPT] size of buffer : %u", bufferLength);

  // Allocate memory for sync buffer.
  // It is used for compensation between native 44.1 and internal 44.0 and
  // for cases when the capture buffer is larger than 10ms.
  const UINT32 syncBufferSize = 2 * (bufferLength * _recAudioFrameSize);
  syncBuffer = new BYTE[syncBufferSize];
  if (syncBuffer == NULL) {
    return (DWORD)E_POINTER;
  }
  WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
               "[CAPT] size of sync buffer : %u [bytes]", syncBufferSize);

  // Get maximum latency for the current stream (will not change for the
  // lifetime of the IAudioClient object).
  REFERENCE_TIME latency;
  _ptrAudioClientIn->GetStreamLatency(&latency);
  WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
               "[CAPT] max stream latency : %u (%3.2f ms)",
               (DWORD)latency, (double)(latency / 10000.0));

  // Get the length of the periodic interval separating successive processing
  // passes by the audio engine on the data in the endpoint buffer.
  REFERENCE_TIME devPeriod = 0;
  REFERENCE_TIME devPeriodMin = 0;
  _ptrAudioClientIn->GetDevicePeriod(&devPeriod, &devPeriodMin);
  WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
               "[CAPT] device period : %u (%3.2f ms)",
               (DWORD)devPeriod, (double)(devPeriod / 10000.0));

  double extraDelayMS = (double)((latency + devPeriod) / 10000.0);
  WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
               "[CAPT] extraDelayMS : %3.2f", extraDelayMS);

  double endpointBufferSizeMS = 10.0 * ((double)bufferLength / (double)_recBlockSize);
  WEBRTC_TRACE(kTraceInfo, kTraceAudioDevice, _id,
               "[CAPT] endpointBufferSizeMS : %3.2f", endpointBufferSizeMS);

  // Start up the capturing stream.
  hr = _ptrAudioClientIn->Start();
  EXIT_ON_ERROR(hr);

  _UnLock();

  // Set event which will ensure that the calling thread modifies the
  // recording state to true.
  SetEvent(_hCaptureStartedEvent);

  // >> ---------------------------- THREAD LOOP ----------------------------
  // Note: only AUDCLNT_STREAMFLAGS_LOOPBACK is set (the stream is not
  // event-driven), so this loop polls GetBuffer instead of waiting on waitArray.
  while (keepRecording_) {
    BYTE* pData = 0;
    UINT32 framesAvailable = 0;
    DWORD flags = 0;
    UINT64 recTime = 0;
    UINT64 recPos = 0;

    std::cout << "bgm audio capturing" << std::endl;

    _Lock();

    // Sanity check to ensure that essential states are not modified
    // during the unlocked period.
    if (_ptrCaptureClient == NULL || _ptrAudioClientIn == NULL) {
      _UnLock();
      WEBRTC_TRACE(kTraceCritical, kTraceAudioDevice, _id,
                   "input state has been modified during unlocked period");
      goto Exit;
    }

    // Find out how much capture data is available.
    hr = _ptrCaptureClient->GetBuffer(
        &pData,            // packet which is ready to be read by the user
        &framesAvailable,  // #frames in the captured packet (can be zero)
        &flags,            // support flags (check)
        &recPos,           // device position of first audio frame in data packet
        &recTime);         // performance counter value when the first frame was recorded

    if (SUCCEEDED(hr)) {
      if (AUDCLNT_S_BUFFER_EMPTY == hr) {
        // Buffer was empty => start waiting for a new capture notification event.
        _UnLock();
        continue;
      }

      if (flags & AUDCLNT_BUFFERFLAGS_SILENT) {
        // Treat all of the data in the packet as silence and ignore the
        // actual data values.
        WEBRTC_TRACE(kTraceWarning, kTraceAudioDevice, _id, "AUDCLNT_BUFFERFLAGS_SILENT");
        pData = NULL;
      }

      assert(framesAvailable != 0);

      if (pData) {
        CopyMemory(&syncBuffer[syncBufIndex * _recAudioFrameSize], pData,
                   framesAvailable * _recAudioFrameSize);
      } else {
        ZeroMemory(&syncBuffer[syncBufIndex * _recAudioFrameSize],
                   framesAvailable * _recAudioFrameSize);
      }
      assert(syncBufferSize >= (syncBufIndex * _recAudioFrameSize) +
                               framesAvailable * _recAudioFrameSize);

      // Release the capture buffer.
      hr = _ptrCaptureClient->ReleaseBuffer(framesAvailable);
      EXIT_ON_ERROR(hr);

      _readSamples += framesAvailable;
      syncBufIndex += framesAvailable;

      QueryPerformanceCounter(&t1);

      // Get the current recording and playout delay.
      uint32_t sndCardRecDelay = (uint32_t)(
          ((((UINT64)t1.QuadPart * _perfCounterFactor) - recTime) / 10000) +
          (10 * syncBufIndex) / _recBlockSize - 10);
      uint32_t sndCardPlayDelay = static_cast<uint32_t>(_sndCardPlayDelay);

      _sndCardRecDelay = sndCardRecDelay;

      while (syncBufIndex >= _recBlockSize) {
        if (_ptrAudioBuffer) {
          _ptrAudioBuffer->SetRecordedBuffer((const int8_t*)syncBuffer, _recBlockSize);
          _ptrAudioBuffer->SetVQEData(sndCardPlayDelay, sndCardRecDelay, 0);
          _ptrAudioBuffer->SetTypingStatus(KeyPressed());

          QueryPerformanceCounter(&t1);  // measure time: START

          _UnLock();  // release lock while making the callback
          _ptrAudioBuffer->DeliverRecordedData();
          _Lock();    // restore the lock

          QueryPerformanceCounter(&t2);  // measure time: STOP

          // Measure "average CPU load".
          // Basically what we do here is measure how many percent of our 10ms
          // period is used for encoding and decoding. This value should be used
          // as a warning indicator only and not seen as an absolute value.
          // Running at ~100% will lead to bad QoS.
          time = (int)(t2.QuadPart - t1.QuadPart);
          _avgCPULoad = (float)(_avgCPULoad * .99 +
                                (time + _playAcc) / (double)(_perfCounterFreq.QuadPart));
          _playAcc = 0;

          // Sanity check to ensure that essential states are not modified
          // during the unlocked period.
          if (_ptrCaptureClient == NULL || _ptrAudioClientIn == NULL) {
            _UnLock();
            WEBRTC_TRACE(kTraceCritical, kTraceAudioDevice, _id,
                         "input state has been modified during unlocked period");
            goto Exit;
          }
        }

        // Store remaining data which could not be delivered as a 10ms segment.
        MoveMemory(&syncBuffer[0],
                   &syncBuffer[_recBlockSize * _recAudioFrameSize],
                   (syncBufIndex - _recBlockSize) * _recAudioFrameSize);

        syncBufIndex -= _recBlockSize;
        sndCardRecDelay -= 10;
      }

      if (_AGC) {
        uint32_t newMicLevel = _ptrAudioBuffer->NewMicLevel();
        if (newMicLevel != 0) {
          // The VQE will only deliver non-zero microphone levels when a change
          // is needed. Set this new mic level (received from the observer as
          // return value in the callback).
          WEBRTC_TRACE(kTraceStream, kTraceAudioDevice, _id,
                       "AGC change of volume: new=%u", newMicLevel);
          // We store this outside of the audio buffer to avoid
          // having it overwritten by the getter thread.
          _newMicLevel = newMicLevel;
          SetEvent(_hSetCaptureVolumeEvent);
        }
      }
    } else {
      // If GetBuffer returns AUDCLNT_E_BUFFER_ERROR, the thread consuming the
      // audio samples must wait for the next processing pass. The client might
      // benefit from keeping a count of the failed GetBuffer calls. If GetBuffer
      // returns this error repeatedly, the client can start a new processing
      // loop after shutting down the current client by calling IAudioClient::Stop,
      // IAudioClient::Reset, and releasing the audio client.
      WEBRTC_TRACE(kTraceError, kTraceAudioDevice, _id,
                   "IAudioCaptureClient::GetBuffer returned AUDCLNT_E_BUFFER_ERROR, hr = 0x%08X", hr);
      goto Exit;
    }

    _UnLock();
  }
  // ---------------------------- THREAD LOOP ---------------------------- <<

  if (_ptrAudioClientIn) {
    hr = _ptrAudioClientIn->Stop();
  }

Exit:
  if (FAILED(hr)) {
    _ptrAudioClientIn->Stop();
    _UnLock();
    _TraceCOMError(hr);
  }

  RevertCaptureThreadPriority();

  if (syncBuffer) {
    delete[] syncBuffer;
  }

  return (DWORD)hr;
}
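The capture thread also has to be started and stopped from outside. The events and the keepRecording_ flag come from the code above; the wrapper functions, the _hCaptureThread member and CaptureThreadFunc below are only a hypothetical sketch of how they might be wired up, not code from the project:

// Hypothetical start/stop wrappers for the capture thread above.
// CaptureThreadFunc could equally be a static member of AudioCaptureCore.
DWORD WINAPI CaptureThreadFunc(LPVOID context) {
  return reinterpret_cast<AudioCaptureCore*>(context)->DoCaptureThread();
}

bool AudioCaptureCore::StartLoopbackCapture() {
  _hShutdownCaptureEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
  _hCaptureSamplesReadyEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
  _hCaptureStartedEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
  _hCaptureThread = CreateThread(NULL, 0, CaptureThreadFunc, this, 0, NULL);
  if (_hCaptureThread == NULL) {
    return false;
  }
  // DoCaptureThread() signals this event once IAudioClient::Start() has succeeded.
  return WaitForSingleObject(_hCaptureStartedEvent, 1000) == WAIT_OBJECT_0;
}

void AudioCaptureCore::StopLoopbackCapture() {
  keepRecording_ = false;            // the polling loop checks this flag
  SetEvent(_hShutdownCaptureEvent);  // useful if an event wait is added later
  WaitForSingleObject(_hCaptureThread, 2000);
  CloseHandle(_hCaptureThread);
}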
Audio mixing
Create a mixer, webrtc::AudioConferenceMixer *audio_mixer_ = nullptr;, and use it to mix the audio whenever more than one source is active, as sketched below.
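Assuming the legacy modules/audio_conference_mixer interface (AudioConferenceMixer::Create, SetMixabilityStatus, RegisterMixedStreamCallback; exact names and signatures vary between WebRTC revisions), the mixer could be created and fed roughly as follows. mic_source_, bgm_source_ and mixed_receiver_ are hypothetical helper objects, not part of WebRTC:

// Sketch of wiring up the conference mixer (legacy API assumed).
// Each audio source implements webrtc::MixerParticipant and returns its latest
// 10 ms frame from GetAudioFrame(); the mixed result arrives via the registered
// output receiver.
audio_mixer_ = webrtc::AudioConferenceMixer::Create(0 /* instance id */);

// mic_source_ / bgm_source_: hypothetical MixerParticipant implementations that
// buffer the microphone PCM and the loopback PCM respectively.
audio_mixer_->SetMixabilityStatus(mic_source_, true);
audio_mixer_->SetMixabilityStatus(bgm_source_, true);

// mixed_receiver_: hypothetical AudioMixerOutputReceiver whose NewMixedAudio()
// forwards the mixed 10 ms frame to the RTMP encoder.
audio_mixer_->RegisterMixedStreamCallback(mixed_receiver_);

// Then, every 10 ms (as done in RecordedDataIsAvailable() below):
audio_mixer_->Process();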
int32_t AnyRtmpCore::RecordedDataIsAvailable(const void* audioSamples,
                                             const size_t nSamples,
                                             const size_t nBytesPerSample,
                                             const size_t nChannels,
                                             const uint32_t samplesPerSec,
                                             const uint32_t totalDelayMS,
                                             const int32_t clockDrift,
                                             const uint32_t currentMicLevel,
                                             const bool keyPressed,
                                             uint32_t& newMicLevel) {
  std::cout << "[-----------] record data available " << nSamples << " "
            << nBytesPerSample << " " << nChannels << " " << samplesPerSec << std::endl;

  rtc::CritScope cs(&cs_audio_record_);

  if (microphone_enable_ && bgm_enable_) {
    // Both the microphone and the background (loopback) source are active:
    // hand the data to the device mixer and let the conference mixer combine them.
    audio_device_mixer_ptr_->RecordedDataIsAvailable(audioSamples, nSamples, nBytesPerSample,
                                                     nChannels, samplesPerSec, totalDelayMS,
                                                     clockDrift, currentMicLevel, keyPressed,
                                                     newMicLevel);
    if (audio_mixer_) {
      audio_mixer_->Process();
    }
  } else {
    // Only one source is active, so no mixing is required.
    if (audio_record_callback_) {
      if (audio_record_sample_hz_ != samplesPerSec || nChannels != audio_record_channels_) {
        // Resample to the configured output rate/channel layout before delivery.
        int16_t temp_output[kMaxDataSizeSamples];
        int samples_per_channel_int = resampler_record_.Resample10Msec(
            (int16_t*)audioSamples, samplesPerSec * nChannels,
            audio_record_sample_hz_ * audio_record_channels_, 1,
            kMaxDataSizeSamples, temp_output);
        audio_record_callback_->OnRecordAudio(temp_output, audio_record_sample_hz_ / 100,
                                              nBytesPerSample, audio_record_channels_,
                                              audio_record_sample_hz_, totalDelayMS);
      } else {
        audio_record_callback_->OnRecordAudio(audioSamples, nSamples, nBytesPerSample,
                                              audio_record_channels_, samplesPerSec,
                                              totalDelayMS);
      }
    }
  }
  return 0;
}
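When both sources are active, the mixed result still has to reach the same OnRecordAudio() path used by the single-source branch above. A hypothetical output receiver might look like the sketch below; it assumes the legacy AudioMixerOutputReceiver interface and the public AudioFrame fields of that era, and AudioRecordCallback is simply a placeholder name for whatever type audio_record_callback_ actually is:

// Hypothetical mixed-audio receiver: forwards each mixed 10 ms frame to the
// OnRecordAudio() callback. Field and method names may differ between WebRTC revisions.
class MixedAudioReceiver : public webrtc::AudioMixerOutputReceiver {
 public:
  explicit MixedAudioReceiver(AudioRecordCallback* callback) : callback_(callback) {}

  void NewMixedAudio(const int32_t id,
                     const webrtc::AudioFrame& mixed_frame,
                     const webrtc::AudioFrame** unique_frames,
                     const uint32_t size) override {
    if (!callback_) return;
    // 16-bit PCM: 2 bytes per sample per channel.
    callback_->OnRecordAudio(mixed_frame.data_,
                             mixed_frame.samples_per_channel_,
                             2 * mixed_frame.num_channels_,
                             mixed_frame.num_channels_,
                             mixed_frame.sample_rate_hz_,
                             0 /* delay not tracked here */);
  }

 private:
  AudioRecordCallback* callback_;  // hypothetical name for the OnRecordAudio sink
};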