因为要学习做WCDMA的流程解析,需要先提取卷积数据,首先就要做FEC卷积译码。
于是网上翻了好大一圈,特地学习了下viterbi译码算法,费很大力气才凑齐能够正确跑起来的代码,特记录一下。
说点题外话:viterbi是个人,全名Andrew J. Viterbi,一枚数学家,美国高通公司的创始人之一(没错,就是现在手机上那个高通芯片的高通),现代无线数字通讯工程的奠基人。
此外,viterbi算法不但可用来做卷积译码,另外一个广泛的应用是做股票证券量化预测计算(隐马模型预测)。没错,就是高频量化交易炒股的算法基础。美国量化对冲基金的传奇,大奖章基金背后的老板James Simons,首创将隐马模型预测用于炒股,也是一枚数学家。
两枚玩数学算法的巨富,谁说学数学算法没用的?
FEC前向纠错译码用的viterbi算法代码摘抄自libfec,里面的实现很全面,还有mmx,sse加速版。
测试是做WCDMA,所以解交织和译码用的3GPP WCDMA的卷积编码参数(一帧编码长度262bits,卷积编码多项式是 r=1/2 k=9),我就只摘取了需要的viterbi29部分,有需要全部实现的同学可以自行去下载libfec。
下面是写来用于测试验证算法正确性的线程代码。从IQ文件中一次读取2个float数,读取262+8(一帧viterbi译码长度)个数据后进行译码,若译码成功程序停在对应的测试断点上。
UINT CALLBACK ProcessIQDataThread(void *p_param)
{
Convolution2Viterbi_t *p_sys = (Convolution2Viterbi_t *)p_param;
vector<double> *p_data = NULL;
int framebits = 262;
struct v29 *vp;
vector<float> uninter1;
vector<float> uninter2;
vector<float> radioFrame;
vector<float> frameL;
vector<float> frameR;
vector<uint8_t> viterbi_in;
uint8_t viterbi_out[33];
uninter2.resize((framebits + 8));
uninter1.resize((framebits + 8) * 2);
viterbi_in.resize((framebits + 8) * 2);
if ((vp = (struct v29 *)create_viterbi29_port(framebits)) == NULL) {
return -1;
}
while (!p_sys->exit_thread)
{
if (!p_data)
{
WaitForSingleObject(p_sys->iq_data_event, 200);
p_sys->iq_data_list.Lock();
p_data = p_sys->iq_data_list.GetUse();
p_sys->iq_data_list.UnLock();
}
if (!p_data)
continue;
// 读取2个浮点数
for (int i = 0; i < p_data->size() / 2; i++)
{
frameL.push_back(p_data->at(0));
frameR.push_back(p_data->at(1));
}
// 一帧译码长度
if (frameL.size() >= (framebits + 8))
{
// 解交织inter2
deInterleavingNP(30, inter2Perm, frameL, uninter2);
radioFrame.insert(radioFrame.end(), uninter2.begin(), uninter2.end());
deInterleavingNP(30, inter2Perm, frameR, uninter2);
radioFrame.insert(radioFrame.end(), uninter2.begin(), uninter2.end());
if (radioFrame.size() >= (framebits + 8) * 2)
{
// 解交织inter1
deInterleavingNP(inter1Columns[1], inter1Perm[1], radioFrame, uninter1);
radioFrame.clear();
// 使用Viterbi算法做FEC译码
for (int i = 0; i < (framebits + 8) * 2; i++)
{
viterbi_in.push_back( ((int)uninter1[i] >> 24) );
}
init_viterbi29_port(vp, 0);
update_viterbi29_blk_port(vp, viterbi_in.data(), framebits + 8);
chainback_viterbi29_port(vp, viterbi_out, framebits, 0);
#ifdef _DEBUG
// 数据验证
for (int i = 0; i < 33 - 4; i++) {
if (viterbi_out[i] == 0x98 && viterbi_out[i + 1] == 0x54 && viterbi_out[i + 2] == 0xA0 && viterbi_out[i + 3] == 0x00)
int bbb = 0;
if (viterbi_out[i] == 0x05 && viterbi_out[i + 1] == 0x33 && viterbi_out[i + 2] == 0x00 && viterbi_out[i + 3] == 0x0c)
int bbb = 0;
if (viterbi_out[i] == 0x00 && viterbi_out[i + 1] == 0x03 && viterbi_out[i + 2] == 0xf4 && viterbi_out[i + 3] == 0x40)
int bbb = 0;
if (viterbi_out[i] == 0xc5 && viterbi_out[i + 1] == 0x80 && viterbi_out[i + 2] == 0x05 && viterbi_out[i + 3] == 0x5a)
int bbb = 0;
if (viterbi_out[i] == 0xe4 && viterbi_out[i + 1] == 0x00 && viterbi_out[i + 2] == 0x33 && viterbi_out[i + 3] == 0xe6)
int bbb = 0;
if (viterbi_out[i] == 0x72 && viterbi_out[i + 1] == 0x08 && viterbi_out[i + 2] == 0x38)
int bbb = 0;
if (viterbi_out[i] == 0xe2 && viterbi_out[i + 1] == 0x00 && viterbi_out[i + 2] == 0x38)
int bbb = 0;
}
#endif
}
frameL.clear();
frameR.clear();
}
p_sys->iq_data_list.Lock();
p_sys->iq_data_list.PushEmpty(p_data);
p_data = p_sys->iq_data_list.GetUse();
p_sys->iq_data_list.UnLock();
}
return 0;
}
解交织 Interleaving.h
#pragma once
#include <vector>
#include <assert.h>
const int inter1Columns[4] = {
1, // TTI = 10ms
2, // TTI = 20ms
4, // TTI = 40ms
8 // TTI = 80ms
};
// 25.212 4.2.5.2 table 4: Inter-Column permutation pattern for 1st interleaving:
const char inter1Perm[4][8] = {
{0}, // TTI = 10ms
{0, 1}, // TTI = 20ms
{0, 2, 1, 3}, // TTI = 40ms
{0, 4, 2, 6, 1, 5, 3, 7} // TTI = 80ms
};
// 25.212 4.2.11 table 7: Inter-Column permutation pattern for 2nd interleaving:
const char inter2Perm[30] = { 0,20,10,5,15,25,3,13,23,8,18,28,1,11,21,
6,16,26,4,14,24,19,9,29,12,2,7,22,27,17 };
/* FIRST steps two pointers through a mapping, one pointer into the interleaved
* data and the other through the uninterleaved data. The fifth argument, COPY,
* determines whether the copy is from interleaved to uninterleaved, or back.
* FIRST assumes no padding is necessary.
* The reason for the define is to minimize the cost of parameterization and
* function calls, as this is meant for L1 code, while also minimizing the
* duplication of code.
*/
#define FIRST(UNINTERLEAVED,UNINTERLEAVEDP,INTERLEAVED,INTERLEAVEDP,COPY) \
assert(UNINTERLEAVED.size() == INTERLEAVED.size()); \
unsigned int rows = UNINTERLEAVED.size() / columns; \
assert(rows * columns == UNINTERLEAVED.size()); \
const char *colp = permutation; \
float *INTERLEAVEDP = &INTERLEAVED[0]; \
for (unsigned i = 0; i < columns; i++) { \
float *UNINTERLEAVEDP = &UNINTERLEAVED[*colp++]; \
for (unsigned int j = 0; j < rows; j++) { \
COPY; \
UNINTERLEAVEDP += columns; \
} \
}
/** interleaving with No Padding */
void interleavingNP(const unsigned int columns, const char *permutation, vector<float> &in, vector<float> &out)
{
FIRST(in, inp, out, outp, *outp++ = *inp)
}
/** de-interleaving with No Padding */
void deInterleavingNP(const unsigned int columns, const char *permutation, vector<float> &in, vector<float> &out)
{
FIRST(out, outp, in, inp, *outp = *inp++)
}
/* SECOND steps two pointers through a mapping, one pointer into the interleaved
* data and the other through the uninterleaved data. The fifth argument, COPY,
* determines whether the copy is from interleaved to uninterleaved, or back.
* SECOND pads if necessary.
* The reason for the define is to minimize the cost of parameterization and
* function calls, as this is meant for L1 code, while also minimizing the
* duplication of code.
*/
#define SECOND(UNINTERLEAVED,UNINTERLEAVEDP,INTERLEAVED,INTERLEAVEDP,COPY) \
assert(UNINTERLEAVED.size() == INTERLEAVED.size()); \
int R2 = (UNINTERLEAVED.size() + columns - 1) / columns; \
int padding = columns * R2 - UNINTERLEAVED.size(); \
int rows = R2; \
int firstPaddedColumn = columns - padding; \
const char *colp = permutation; \
float *UNINTERLEAVEDP = &UNINTERLEAVED[0]; \
for (int i = 0; i < columns; i++) { \
int trows = rows - (*colp >= firstPaddedColumn); \
float *INTERLEAVEDP = &INTERLEAVED[*colp++]; \
for (int j = 0; j < trows; j++) { \
COPY; \
INTERLEAVEDP += columns; \
} \
}
/** interleaving With Padding */
void interleavingWP(const int columns, const char *permutation, vector<float> &in, vector<float> &out)
{
SECOND(in, inp, out, outp, *outp = *inp++)
}
/** de-interleaving With Padding */
void deInterleavingWP(const int columns, const char *permutation, vector<float> &in, vector<float> &out)
{
SECOND(out, outp, in, inp, *outp++ = *inp)
}
/*
Determining the constants.
From the standard we know:
* All frame sizes for the BCH.
* transport block is 246 bits
* there are two radio frames, 270 bits each
* TTI is 20 ms
* SF is 256
* parity word Li is 16 bits
* For all downlink TrCH except BCH, the radio frame size is 2*38400/SF = 76800/SF.
* For SF=256 that's 300.
* For SF=4 that's 19200.
* The maximum code block size for convulutional coding is 540 bits (25.212 4.2.2.2).
* That corresponds to a radio frame size of 1080 bits, or a spreading factor of 71,
meaning that the smallest spreading factor that can be used is 128.
* 76800/128 = 600 coded bits -> roughly 300 data bits.
* That corresponds to an input rate of roughly 30 kb/s at.
* The maximum code block size for turbo coding is 5114 bits (25.212 4.2.2.2).
* That corresponds to a radio frame size of 15342 bits, or a spreading factor of 5,
meaning that the smallest spreading factor that can be used is 8.
* 76800/8 = 9600 coded bits -> roughly 3200 data bits.
* That corresponds to an input rate of roughly 320 kb/s.
OK - SO HOW DO YOU GET HIGHER RATES?? HOW CAN YOU USE SF=4??
A: Use the full 5114-but code block and then expand it with rate-matching.
You still can't get the full ~640 kb/s implied by SF=4, but you get to ~500 kb/s.
(pat) A: They considered this problem. See 25.212 4.2.2.2 Code block segmentation.
In Layer1, after transport block concatenation, you then simply chop the result up
into the largest pieces that can go through the encoder, then put them back together after.
From "higher layers" we are given:
* SF: 4, 8, 16, 32, 64, 128, 256.
* P: 24, 16, 8, 0 bits
* TTI: 10, 20, 40 ms.
To simplify things, we set:
* TTI 10 ms always on DCH and FACH, 20 ms on PCH and BCH
* BCH and PCH are always rate-1/2 convolutional code
* DCH and FACH are always rate-1/3 turbo code
* no rate-matching, no puncturing
* parity word is always 16 bits
* So the only parameter than changes is spreading factor.
* We will only support 15-slot (non-compressed) slot formats.
From our simplifications we also know:
* For non-BCH/PCH TrCH there is one radio frame,
76800/SF channel (coded) bits, per transport block.
* DCH and FACH always use rate-1/3 turbo code,
which has 12 terminating bits in the output.
* For DCH and FACH, the transport block size is
((76800/SF - 12)/3) - P = (25600/SF) - 4 - P data bits,
where P is the parity word size.
* Fix P=16 for simplicity and transport block size is (25600/SF) - 20.
* for SF=256, that's 80 bits.
* for SF=16, that's 1580 bits.
* for SF=8, that's 3180 bits.
* For PCH there is one radio frame,
76800/SF channel (coded) bits, per transport block.
* SF=64, for that's 1200 channel bits.
* It's a rate-1/2 conv code, so that's 1200/2 - 8 - P data bits.
* P=16 so that's 1200/2 - 24 = 576 transport bits. Really?
*/
#if 0
const int inter1Columns[] = { 1, 2, 4, 8 };
const char inter1Perm[4][8] = {
{0},
{0, 1},
{0, 2, 1, 3},
{0, 4, 2, 6, 1, 5, 3, 7}
};
const char inter2Perm[] = {
0, 20, 10, 5, 15, 25, 3, 13, 23, 8, 18, 28, 1, 11, 21,
6, 16, 26, 4, 14, 24, 19, 9, 29, 12, 2, 7, 22, 27, 17
};
vector<char> randomBitVector(int n)
{
vector<char> t(n);
for (int i = 0; i < n; i++) t[i] = random() % 2;
return t;
}
void testInterleavings()
{
int lth1 = 48;
int C2 = 30;
for (int i = 0; i < 4; i++) {
vector<char> v1 = randomBitVector(lth1);
vector<char> v2(lth1);
vector<char> v3(lth1);
v1.interleavingNP(inter1Columns[i], inter1Perm[i], v2);
v2.deInterleavingNP(inter1Columns[i], inter1Perm[i], v3);
cout << "first " << i << " " << (veq(v1, v3) ? "ok" : "fail") << endl;
}
for (int lth2 = 90; lth2 < 120; lth2++) {
vector<char> v1 = randomBitVector(lth2);
vector<char> v2(lth2);
vector<char> v3(lth2);
v1.interleavingWP(C2, inter2Perm, v2);
v2.deInterleavingWP(C2, inter2Perm, v3);
cout << "second " << lth2 << " " << (veq(v1, v3) ? "ok" : "fail") << endl;
}
for (int lth = 48; lth <= 4800; lth *= 10) {
TurboInterleaver er(lth);
cout << "Turbo Interleaver permutation(" << lth << ") " << (permutationCheck(er.permutation()) ? "ok" : "fail") << endl;
vector<char> er1 = randomBitVector(lth);
vector<char> er2(lth);
er.interleave(er1, er2);
vector<char> er3(lth);
er.unInterleave(er2, er3);
cout << "Turbo Interleaver(" << lth << ") " << (veq(er1.sliced(), er3) ? "ok" : "fail") << endl;
}
}
#endif
viterbi译码
partab实现 partab.c
/* Utility routines for FEC support
* Copyright 2004, Phil Karn, KA9Q
*/
#include <stdio.h>
#include "fec.h"
unsigned char Partab[256] = {0};
int P_init = 0;
/* Create 256-entry odd-parity lookup table
* Needed only on non-ia32 machines
*/
void partab_init(void) {
int i, cnt, ti;
/* Initialize parity lookup table */
for (i = 0; i < 256; i++) {
cnt = 0;
ti = i;
while (ti) {
if (ti & 1)
cnt++;
ti >>= 1;
}
Partab[i] = cnt & 1;
}
P_init = 1;
}
/* Lookup table giving count of 1 bits for integers 0-255 */
int Bitcnt[] = {
0, 1, 1, 2, 1, 2, 2, 3,
1, 2, 2, 3, 2, 3, 3, 4,
1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5,
1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6,
1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7,
1, 2, 2, 3, 2, 3, 3, 4,
2, 3, 3, 4, 3, 4, 4, 5,
2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6,
2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7,
2, 3, 3, 4, 3, 4, 4, 5,
3, 4, 4, 5, 4, 5, 5, 6,
3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7,
3, 4, 4, 5, 4, 5, 5, 6,
4, 5, 5, 6, 5, 6, 6, 7,
4, 5, 5, 6, 5, 6, 6, 7,
5, 6, 6, 7, 6, 7, 7, 8,
};
fec.h
/* User include file for libfec
* Copyright 2004, Phil Karn, KA9Q
* May be used under the terms of the GNU Lesser General Public License (LGPL)
*/
#ifndef _FEC_H_
#define _FEC_H_
#ifdef __cplusplus
extern "C" {
#endif
/* r=1/2 k=9 convolutional encoder polynomials */
#define V29POLYA 0x1af
#define V29POLYB 0x11d
void *create_viterbi29_port(int len);
void set_viterbi29_polynomial_port(int polys[2]);
int init_viterbi29_port(void *p, int starting_state);
int chainback_viterbi29_port(void *p, unsigned char *data, unsigned int nbits, unsigned int endstate);
void delete_viterbi29_port(void *p);
int update_viterbi29_blk_port(void *p, unsigned char *syms, int nbits);
void partab_init();
static inline int parityb(unsigned char x) {
extern unsigned char Partab[256];
extern int P_init;
if (!P_init) {
partab_init();
}
return Partab[x];
}
static inline int parity(int x) {
/* Fold down to one byte */
x ^= (x >> 16);
x ^= (x >> 8);
return parityb(x);
}
#ifdef __cplusplus
}
#endif
#endif /* _FEC_H_ */
viterbi29_port.c
/* K=9 r=1/2 Viterbi decoder in portable C
* Copyright Feb 2004, Phil Karn, KA9Q
* May be used under the terms of the GNU Lesser General Public License (LGPL)
*/
#include <stdio.h>
#include <stdlib.h>
#include <memory.h>
#include "fec.h"
typedef union { unsigned int w[256]; } metric_t;
typedef union { unsigned long w[8]; } decision_t;
static union { unsigned char c[128]; } Branchtab29[2];
static int Init = 0;
/* State info for instance of Viterbi decoder */
struct v29 {
metric_t metrics1; /* path metric buffer 1 */
metric_t metrics2; /* path metric buffer 2 */
decision_t* dp; /* Pointer to current decision */
metric_t* old_metrics, * new_metrics; /* Pointers to path metrics, swapped on every bit */
decision_t* decisions; /* Beginning of decisions for block */
};
/* Initialize Viterbi decoder for start of new frame */
int init_viterbi29_port(void* p, int starting_state) {
struct v29* vp = p;
int i;
if (p == NULL)
return -1;
for (i = 0; i < 256; i++)
vp->metrics1.w[i] = 63;
vp->old_metrics = &vp->metrics1;
vp->new_metrics = &vp->metrics2;
vp->dp = vp->decisions;
vp->old_metrics->w[starting_state & 255] = 0; /* Bias known start state */
return 0;
}
void set_viterbi29_polynomial_port(int polys[2]) {
int state;
for (state = 0; state < 128; state++) {
Branchtab29[0].c[state] = (polys[0] < 0) ^ parity((2 * state) & abs(polys[0])) ? 255 : 0;
Branchtab29[1].c[state] = (polys[1] < 0) ^ parity((2 * state) & abs(polys[1])) ? 255 : 0;
}
Init++;
}
/* Create a new instance of a Viterbi decoder */
void* create_viterbi29_port(int len) {
struct v29* vp;
if (!Init) {
int polys[2] = { V29POLYA,V29POLYB };
set_viterbi29_polynomial_port(polys);
}
if ((vp = (struct v29*)malloc(sizeof(struct v29))) == NULL)
return NULL;
if ((vp->decisions = (decision_t*)malloc((len + 8) * sizeof(decision_t))) == NULL) {
free(vp);
return NULL;
}
init_viterbi29_port(vp, 0);
return vp;
}
/* Viterbi chainback */
int chainback_viterbi29_port(
void* p,
unsigned char* data, /* Decoded output data */
unsigned int nbits, /* Number of data bits */
unsigned int endstate) /* Terminal encoder state */
{
struct v29* vp = p;
decision_t* d;
if (p == NULL)
return -1;
d = vp->decisions;
/* Make room beyond the end of the encoder register so we can
* accumulate a full byte of decoded data
*/
endstate %= 256;
/* The store into data[] only needs to be done every 8 bits.
* But this avoids a conditional branch, and the writes will
* combine in the cache anyway
*/
d += 8; /* Look past tail */
while (nbits-- != 0) {
int k;
k = (d[nbits].w[(endstate) / 32] >> (endstate % 32)) & 1;
data[nbits >> 3] = endstate = (endstate >> 1) | (k << 7);
}
return 0;
}
/* Delete instance of a Viterbi decoder */
void delete_viterbi29_port(void* p) {
struct v29* vp = p;
if (vp != NULL) {
free(vp->decisions);
free(vp);
}
}
/* C-language butterfly */
#define BFLY(i) {\
unsigned int metric,m0,m1,decision;\
metric = (Branchtab29[0].c[i] ^ sym0) + (Branchtab29[1].c[i] ^ sym1);\
m0 = vp->old_metrics->w[i] + metric;\
m1 = vp->old_metrics->w[i+128] + (510 - metric);\
decision = (signed int)(m0-m1) > 0;\
vp->new_metrics->w[2*i] = decision ? m1 : m0;\
d->w[i/16] |= decision << ((2*i)&31);\
m0 -= (metric+metric-510);\
m1 += (metric+metric-510);\
decision = (signed int)(m0-m1) > 0;\
vp->new_metrics->w[2*i+1] = decision ? m1 : m0;\
d->w[i/16] |= decision << ((2*i+1)&31);\
}
/* Update decoder with a block of demodulated symbols
* Note that nbits is the number of decoded data bits, not the number
* of symbols!
*/
int update_viterbi29_blk_port(void* p, unsigned char* syms, int nbits) {
struct v29* vp = p;
decision_t* d;
if (p == NULL)
return -1;
d = (decision_t*)vp->dp;
while (nbits--) {
void* tmp;
unsigned char sym0, sym1;
int i;
for (i = 0; i < 8; i++)
d->w[i] = 0;
sym0 = *syms++;
sym1 = *syms++;
for (i = 0; i < 128; i++)
BFLY(i);
d++;
tmp = vp->old_metrics;
vp->old_metrics = vp->new_metrics;
vp->new_metrics = tmp;
}
vp->dp = d;
return 0;
}