From: baobrien <baobrien@01035d8c-6547-0410-b346-abe4f91aad63>
Date: Sun, 17 Jan 2016 19:52:10 +0000 (+0000)
Subject: First commit of C Horus modem and test code
X-Git-Url: http://git.whiteaudio.com/gitweb/?a=commitdiff_plain;h=1df07bf29f55e32cf379b8fec76e69936b85f49e;p=freetel-svn-tracking.git

First commit of C Horus modem and test code

git-svn-id: https://svn.code.sf.net/p/freetel/code@2626 01035d8c-6547-0410-b346-abe4f91aad63
---

diff --git a/codec2-dev/octave/fsk_horus_2fsk.m b/codec2-dev/octave/fsk_horus_2fsk.m
new file mode 100644
index 00000000..d4818db6
--- /dev/null
+++ b/codec2-dev/octave/fsk_horus_2fsk.m
@@ -0,0 +1,1008 @@
+% fsk_horus.m
+% David Rowe 10 Oct 2015
+%
+% Experimental near space balloon FSK demodulator
+% Assume high SNR, but fades near end of mission can wipe out a few bits
+% So low SNR perf not a huge issue
+%
+% [X] processing buffers of 1 second
+%     + 8000 samples input
+%     + keep 30 second sliding window to extract packet from
+%     + do fine timing on this
+%     [X] estimate frequency of two tones
+%         + this way we cope with variable shift and drift
+%         + starts to lose it at 8 Eb/No = 8db.  Maybe wider window?
+%     [X] estimate amplitudes and equalise, or limit
+%         + not needed - tones so close in freq unlikely to be significant ampl diff
+%           across SSB rx filter
+% [X] Eb/No point 8dB, 2% ish
+% [X] fine timing and sample slips, +/- 1000ppm (0.1%) clock offset test
+% [ ] bit flipping against CRC
+% [ ] implement CRC
+% [X] frame sync
+% [X] compare to fldigi
+%     + in AWGN channel 3-4dB improvement.  In my tests fldigi can't decode  
+%       with fading model, requires Eb/No > 40dB, this demo useable at Eb/No = 20dB
+% [X] test over range of f1/f2, shifts, timing offsets, clock offsets, Eb/No
+%     [X] +/- 1000ppm clock offset OK at Eb/No = 10dB, starts to lose it at 8dB
+%     [X] tone freq est starts to lose it at 8dB in awgn.  Maybe wider window?
+% [ ] low snr detection of $$$$$$
+%     + we might be able to pick up a "ping" at very low SNRs to help find baloon on ground
+% [ ] streaming, indicator of audio freq, i.e. speaker output?
+
+% horus binary:
+% [ ] BER estimate/found/corrected
+
+1;
+
+function states = fsk_horus_init(Fs,Rs)
+  states.Ndft = 2.^ceil(log2(Fs)); % find nearest power of 2 for effcient FFT
+  states.Fs = Fs;
+  N = states.N = Fs;             % processing buffer size, nice big window for f1,f2 estimation
+  states.Rs = Rs;
+  Ts = states.Ts = Fs/Rs;
+  assert(Ts == floor(Ts), "Fs/Rs must be an integer");
+  states.nsym = N/Ts;            % number of symbols in one procesing frame
+  Nmem = states.Nmem  = N+2*Ts;  % two symbol memory in down converted signals to allow for timing adj
+  states.f1_dc = zeros(1,Nmem);
+  states.f2_dc = zeros(1,Nmem);
+  states.P = 8;                  % oversample rate out of filter
+  assert(Ts/states.P == floor(Ts/states.P), "Ts/P must be an integer");
+
+  states.nin = N;                % can be N +/- Ts/P samples to adjust for sample clock offsets
+  states.verbose = 0;
+  states.phi1 = 0;               % keep down converter osc phase continuous
+  states.phi2 = 0;
+
+  printf("Fs: %d Rs: %d Ts: %d nsym: %d\n", states.Fs, states.Rs, states.Ts, states.nsym);
+
+  % BER stats 
+
+  states.ber_state = 0;
+  states.Tbits = 0;
+  states.Terrs = 0;
+  states.nerr_log = 0;
+
+  states.df = 0;
+  states.f1 = 0;
+  states.f2 = 0;
+  states.norm_rx_timing = 0;
+  states.ppm = 0;
+  states.prev_pkt = [];
+
+  % protocol specific states
+
+  states.rtty = fsk_horus_init_rtty_uw(states);
+  states.binary = fsk_horus_init_binary_uw;
+endfunction
+
+
+% init rtty protocol specifc states
+
+function rtty = fsk_horus_init_rtty_uw(states)
+  % Generate unque word that correlates against the ASCII "$$$$$" that
+  % is at the start of each frame.
+
+  dollar_bits = fliplr([0 1 0 0 1 0 0]);
+  mapped_db = 2*dollar_bits - 1;
+  sync_bits = [1 1 0];
+  mapped_sb = 2*sync_bits - 1;
+  %mapped_sb = [ 0 0 0 ];
+
+  mapped = [mapped_db mapped_sb];
+  npad = rtty.npad = 3;     % one start and two stop bits between 7 bit ascii chars
+  nfield = rtty.nfield = 7; % length of ascii character field
+
+  rtty.uw = [mapped mapped mapped mapped mapped];
+
+  rtty.uw_thresh = length(rtty.uw) - 8; % allow a few bit errors when looking for UW
+
+  rtty.max_packet_len = 1000;
+endfunction
+
+
+
+function binary = fsk_horus_init_binary_uw
+  % Generate 16 bit "$$" unique word that is at the front of every horus binary
+  % packet
+
+  dollar_bits = [0 0 1 0 0 1 0 0];
+  mapped_db = 2*dollar_bits - 1;
+
+  binary.uw = [mapped_db mapped_db];
+  binary.uw_thresh = length(binary.uw);   % no bit errors when looking for UW
+
+  binary.max_packet_len = 400;
+endfunction
+
+
+% test modulator function
+
+function tx  = fsk_horus_mod(states, tx_bits)
+    tx = zeros(states.Ts*length(tx_bits),1);
+    tx_phase = 0;
+    Ts = states.Ts;
+    Fs = states.Fs;
+    f1 = states.f1_tx;  f2 = states.f2_tx;
+    df = states.df; % tone freq change in Hz/s
+    dA = states.dA;
+
+    for i=1:length(tx_bits)
+      if tx_bits(i) == 0
+        tx_phase_vec = tx_phase + (1:Ts)*2*pi*f1/Fs;
+        tx((i-1)*Ts+1:i*Ts) = dA*2.0*cos(tx_phase_vec);
+      else
+        tx_phase_vec = tx_phase + (1:Ts)*2*pi*f2/Fs;
+        tx((i-1)*Ts+1:i*Ts) = 2.0*cos(tx_phase_vec);
+      end
+      tx_phase = tx_phase_vec(Ts) - floor(tx_phase_vec(Ts)/(2*pi))*2*pi;
+      f1 += df*Ts/Fs; f2 += df*Ts/Fs;
+    end
+endfunction
+
+
+% Given a buffer of nin input Rs baud FSK samples, returns nsym bits.
+%
+% Automagically estimates the frequency of the two tones, or
+% looking at it another way, the frequency offset and shift
+%
+% nin is the number of input samples required by demodulator.  This is
+% time varying.  It will nominally be N (8000), and occasionally N +/- 
+% Ts/2 (e.g. 8080 or 7920).  This is how we compensate for differences between the
+% remote tx sample clock and our sample clock.  This function always returns
+% N/Ts (50) demodulated bits.  Variable number of input samples, constant number
+% of output bits.
+
+function [rx_bits states] = fsk_horus_demod(states, sf)
+  N = states.N;
+  Ndft = states.Ndft;
+  Fs = states.Fs;
+  Rs = states.Rs;
+  Ts = states.Ts;
+  nsym = states.nsym;
+  P = states.P;
+  nin = states.nin;
+  verbose = states.verbose;
+  Nmem = states.Nmem;
+
+  assert(length(sf) == nin);
+
+  % find tone frequency and amplitudes ---------------------------------------------
+
+  h = hanning(nin);
+  Sf = fft(sf .* h, Ndft);
+  [m1 m1_index] = max(Sf(1:Ndft/2));
+
+  % zero out region 100Hz either side of max so we can find second highest peak
+
+  Sf2 = Sf;
+  st = m1_index - floor(100*Ndft/Fs);
+  if st < 1
+    st = 1;
+  end
+  en = m1_index + floor(100*Ndft/Fs);
+  if en > Ndft/2
+    en = Ndft/2;
+  end
+  Sf2(st:en) = 0;
+
+  [m2 m2_index] = max(Sf2(1:Ndft/2));
+  
+  % f1 always the lower tone
+
+  if m1_index < m2_index
+    f1 = (m1_index-1)*Fs/Ndft;
+    f2 = (m2_index-1)*Fs/Ndft;
+    twist = 20*log10(m1/m2);
+  else
+    f1 = (m2_index-1)*Fs/Ndft;
+    f2 = (m1_index-1)*Fs/Ndft;
+    twist = 20*log10(m2/m1);
+  end
+
+  states.f1 = f1;
+  states.f2 = f2;
+
+  %printf("ESTF - f1 = %d, f2 = %d, twist = %f \n",f1,f2,twist);
+  if bitand(verbose,0x1)
+    printf("centre: %4.0f shift: %4.0f twist: %3.1f dB\n", (f2+f1)/2, f2-f1, twist);
+  end
+  if bitand(verbose,0x8)
+    printf("f1: %4.0f Hz f2: %4.0f Hz a1: %f a2: %f\n", f1, f2, 2.0*abs(m1)/Ndft, 2.0*abs(m2)/Ndft);
+  end
+
+  % down convert and filter at rate P ------------------------------
+
+  % update filter (integrator) memory by shifting in nin samples
+  
+  nold = Nmem-nin; % number of old samples we retain
+
+  f1_dc = states.f1_dc; 
+  f1_dc(1:nold) = f1_dc(Nmem-nold+1:Nmem);
+  f2_dc = states.f2_dc; 
+  f2_dc(1:nold) = f2_dc(Nmem-nold+1:Nmem);
+
+  % shift down to around DC, ensuring continuous phase from last frame
+
+  phi1_vec = states.phi1 + (1:nin)*2*pi*f1/Fs;
+  phi2_vec = states.phi2 + (1:nin)*2*pi*f2/Fs;
+
+  f1_dc(nold+1:Nmem) = sf' .* exp(-j*phi1_vec);
+  f2_dc(nold+1:Nmem) = sf' .* exp(-j*phi2_vec);
+
+  states.phi1  = phi1_vec(nin);
+  states.phi1 -= 2*pi*floor(states.phi1/(2*pi));
+  states.phi2  = phi2_vec(nin);
+  states.phi2 -= 2*pi*floor(states.phi2/(2*pi));
+
+  % save filter (integrator) memory for next time
+
+  states.f1_dc = f1_dc;
+  states.f2_dc = f2_dc;
+
+  % integrate over symbol period, which is effectively a LPF, removing
+  % the -2Fc frequency image.  Can also be interpreted as an ideal
+  % integrate and dump, non-coherent demod.  We run the integrator at
+  % rate P (1/P symbol offsets) to get outputs at a range of different
+  % fine timing offsets.  We calculate integrator output over nsym+1
+  % symbols so we have extra samples for the fine timing re-sampler at either
+  % end of the array.
+
+
+  length(f1_dc);
+  rx_bits = zeros(1, (nsym+1)*P);
+  for i=1:(nsym+1)*P
+    st = 1 + (i-1)*Ts/P;
+    en = st+Ts-1;
+    f1_int(i) = sum(f1_dc(st:en));
+    f2_int(i) = sum(f2_dc(st:en));
+  end
+  states.f1_int = f1_int;
+  states.f2_int = f2_int;
+  length(f1_dc);
+  % fine timing estimation -----------------------------------------------
+
+  % Non linearity has a spectral line at Rs, with a phase
+  % related to the fine timing offset.  See:
+  %   http://www.rowetel.com/blog/?p=3573 
+  % We have sampled the integrator output at Fs=P samples/symbol, so
+  % lets do a single point DFT at w = 2*pi*f/Fs = 2*pi*Rs/(P*Rs)
+ 
+  Np = length(f1_int);
+  w = 2*pi*(Rs)/(P*Rs);
+
+  %%# same as sum of ((abs(f1_int)-abs(f2_int)).^2) .* exp(-j*w*(0:Np-1))
+  x = ((abs(f1_int)-abs(f2_int)).^2) * exp(-j*w*(0:Np-1))';
+  
+
+  x;
+  norm_rx_timing = angle(x)/(2*pi);
+  rx_timing = norm_rx_timing*P;
+
+  states.x = x;
+  states.rx_timing = rx_timing;
+  prev_norm_rx_timing = states.norm_rx_timing;
+  states.norm_rx_timing = norm_rx_timing;
+
+  % estimate sample clock offset in ppm
+  % d_norm_timing is fraction of symbol period shift over nsym symbols
+
+  d_norm_rx_timing = norm_rx_timing - prev_norm_rx_timing;
+
+  % filter out big jumps due to nin changes
+
+  if abs(d_norm_rx_timing) < 0.2
+    appm = 1E6*d_norm_rx_timing/nsym;
+    states.ppm = 0.9*states.ppm + 0.1*appm;
+  end
+
+  % work out how many input samples we need on the next call. The aim
+  % is to keep angle(x) away from the -pi/pi (+/- 0.5 fine timing
+  % offset) discontinuity.  The side effect is to track sample clock
+  % offsets
+
+  next_nin = N;
+  if norm_rx_timing > 0.25
+     next_nin += Ts/2;
+  end
+  if norm_rx_timing < -0.25;
+     next_nin -= Ts/2;
+  end
+  states.nin = next_nin;
+
+  % Re-sample integrator outputs using fine timing estimate and linear interpolation
+
+  low_sample = floor(rx_timing);
+  fract = rx_timing - low_sample;
+  high_sample = ceil(rx_timing);
+
+  if bitand(verbose,0x2)
+    printf("rx_timing: %3.2f low_sample: %d high_sample: %d fract: %3.3f nin_next: %d\n", rx_timing, low_sample, high_sample, fract, next_nin);
+  end
+
+  f1_int_resample = zeros(1,nsym);
+  f2_int_resample = zeros(1,nsym);
+  rx_bits = zeros(1,nsym);
+  rx_bits_sd = zeros(1,nsym);
+  for i=1:nsym
+    st = i*P+1;
+    f1_int_resample(i) = f1_int(st+low_sample)*(1-fract) + f1_int(st+high_sample)*fract;
+    f2_int_resample(i) = f2_int(st+low_sample)*(1-fract) + f2_int(st+high_sample)*fract;
+    %f1_int_resample(i) = f1_int(st+1);
+    %f2_int_resample(i) = f2_int(st+1);
+    rx_bits(i) = abs(f2_int_resample(i)) > abs(f1_int_resample(i));
+    
+    
+    rx_bits_sd(i) = abs(f2_int_resample(i)) - abs(f1_int_resample(i));
+ end
+
+  states.f1_int_resample = f1_int_resample;
+  states.f2_int_resample = f2_int_resample;
+  states.rx_bits_sd = rx_bits_sd;
+
+  % Eb/No estimation
+
+  x = abs(abs(f1_int_resample) - abs(f2_int_resample));
+  states.EbNodB = 20*log10(1E-6+mean(x)/(1E-6+std(x)));
+endfunction
+
+
+% Look for unique word and return index of first UW bit, or -1 if no
+% UW found Sometimes there may be several matches, returns the
+% position of the best match to UW.
+
+function [uw_start best_corr] = find_uw(states, start_bit, rx_bits)
+  uw = states.uw;
+
+  mapped_rx_bits = 2*rx_bits - 1;
+  best_corr = 0;
+  uw_start = -1;
+
+  for i=start_bit:length(rx_bits) - length(uw)
+    corr  = mapped_rx_bits(i:i+length(uw)-1) * uw';
+    if (corr >= states.uw_thresh) && (corr > best_corr)
+      uw_start = i;
+      best_corr = corr;
+    end
+  end
+endfunction
+
+
+% Extract ASCII string from a Horus frame of bits
+
+function [str crc_ok] = extract_ascii(states, rx_bits_buf, uw_loc)
+  nfield = states.nfield;
+  npad = states.npad;
+
+  str = []; str_dec = []; nstr = 0; ptx_crc = 1; rx_crc = "";
+  endpacket = 0;
+
+  st = uw_loc + length(states.uw);  % first bit of first char
+  en = uw_loc + states.max_packet_len - nfield;
+  %printf("\nst: %d en: %d len: %d\n", st, en, length(rx_bits_buf));
+
+  for i=st:nfield+npad:en
+    field = rx_bits_buf(i:i+nfield-1);
+    ch_dec = field * (2.^(0:nfield-1))';
+
+    % filter out unlikely characters that bit errors may introduce, and ignore \n
+
+    if (ch_dec > 31) && (ch_dec < 91)
+      str = [str char(ch_dec)];
+    else 
+      str = [str char(32)]; % space is "not sure"
+    end
+    nstr++;
+
+    % build up array for CRC16 check
+
+    if !endpacket && (ch_dec == 42)
+      endpacket = 1; 
+      rx_crc = crc16(str_dec);      % found a '*' so that's the end of the string for CRC calculations
+      ptx_crc = nstr+1;             % this is where the transmit CRC starts
+    end
+    if !endpacket
+      str_dec = [str_dec ch_dec];
+    end
+  end
+
+  if (ptx_crc+3) <= length(str)
+    tx_crc = str(ptx_crc:ptx_crc+3);
+    crc_ok = strcmp(tx_crc, rx_crc);
+  else
+    crc_ok = 0;
+  end
+
+  str = str(1:ptx_crc-2);
+
+endfunction
+
+
+% Use soft decision information to find bits most likely in error.  I think
+% this is some form of maximum likelihood decoding.
+
+function [str crc_ok rx_bits_log_flipped] = sd_bit_flipping(states, rx_bits_log, rx_bits_sd_log, st, en);
+
+  % force algorithm to ignore rs232 sync bits by marking them as "very likely", they have
+  % no input to crc algorithm
+
+  nfield = states.nfield;
+  npad = states.npad;
+  for i=st:nfield+npad:en
+    rx_bits_sd_log(i+nfield:i+nfield+npad-1) = 1E6;
+  end
+
+  % make a list of bits with smallest soft decn values
+
+  [dodgy_bits_mag dodgy_bits_index] = sort(abs(rx_bits_sd_log(st+length(states.uw):en)));
+  dodgy_bits_index += length(states.uw) + st - 1;
+  nbits = 6;
+  ntries = 2^nbits;
+  str = "";
+  crc_ok = 0;
+  
+  % try various combinations of these bits
+
+  for i=1:ntries-1
+    error_mask = zeros(1, length(rx_bits_log));
+    for b=1:nbits
+      x = bitget(i,b);
+      bit_to_flip = dodgy_bits_index(b);
+      error_mask(bit_to_flip) = x;
+      %printf("st: %d i: %d b: %d x: %d index: %d\n", st, i,b,x,bit_to_flip);
+    end
+    rx_bits_log_flipped = xor(rx_bits_log, error_mask);
+    [str_flipped crc_ok_flipped] = extract_ascii(states, rx_bits_log_flipped, st);
+    if crc_ok_flipped
+      %printf("Yayy we fixed a packet by flipping with pattern %d\n", i);
+      str = str_flipped;
+      crc_ok = crc_ok_flipped;
+    end
+  end
+endfunction
+
+
+% Extract as many ASCII packets as we can from a great big buffer of bits
+
+function extract_and_print_rtty_packets(states, rx_bits_log, rx_bits_sd_log)
+
+  % use UWs to delimit start and end of data packets
+
+  bit = 1;
+  nbits = length(rx_bits_log);
+  nfield = states.rtty.nfield;
+  npad = states.rtty.npad;
+
+  uw_loc = find_uw(states.rtty, bit, rx_bits_log);
+
+  while (uw_loc != -1)
+
+    if (uw_loc + states.rtty.max_packet_len) < nbits
+      % Now start picking out 7 bit ascii chars from frame.  It has some
+      % structure so we can guess where fields are.  I hope we don't get
+      % RS232 idle bits stuck into it anywhere, ie "bit fields" don't
+      % change dynamically.
+
+      % dump msg bits so we can use them as a test signal
+      %msg = rx_bits_log(st:uw_loc-1);
+      %save -ascii horus_msg.txt msg
+
+      % simulate bit error for testing
+      %rx_bits_log(st+200) = xor(rx_bits_log(st+100),1);
+      %rx_bits_sd_log(st+100) = 0;
+      
+      [str crc_ok] = extract_ascii(states.rtty, rx_bits_log, uw_loc);
+
+      if crc_ok == 0
+        [str_flipped crc_flipped_ok rx_bits_log] = sd_bit_flipping(states.rtty, rx_bits_log, rx_bits_sd_log, uw_loc, uw_loc+states.rtty.max_packet_len); 
+      end
+
+      % update memory of previous packet, we use this to guess where errors may be
+      if crc_ok || crc_flipped_ok
+        states.prev_pkt = rx_bits_log(uw_loc+length(states.rtty.uw):uw_loc+states.rtty.max_packet_len);
+      end
+
+      if crc_ok
+        str = sprintf("%s CRC OK", str);
+      else
+        if crc_flipped_ok
+          str = sprintf("%s fixed", str_flipped);
+        else
+          str = sprintf("%s CRC BAD", str);
+        end
+      end
+      printf("%s\n", str);
+    end
+
+    % look for next packet
+
+    bit = uw_loc + length(states.rtty.uw);
+    uw_loc = find_uw(states.rtty, bit, rx_bits_log);
+
+  endwhile
+endfunction
+ 
+
+% Extract as many binary packets as we can from a great big buffer of bits,
+% and send them to the C decoder for FEC decoding.
+% horus_l2 can be compiled a bunch of different ways.  You need to
+% compile with:
+%   codec2-dev/src$ gcc horus_l2.c -o horus_l2 -Wall -DDEC_RX_BITS -DHORUS_L2_RX
+
+function extract_and_decode_binary_packets(states, rx_bits_log)
+
+  % use UWs to delimit start and end of data packets
+
+  bit = 1;
+  nbits = length(rx_bits_log);
+
+  uw_loc = find_uw(states.binary, bit, rx_bits_log);
+
+  while (uw_loc != -1)
+
+    if (uw_loc+states.binary.max_packet_len) < nbits
+      %printf("st: %d uw_loc: %d\n", st, uw_loc);
+
+      % OK we have a packet delimited by two UWs.  Lets convert the bit
+      % stream into bytes and save for decoding
+
+      pin = uw_loc;    
+      for i=1:45
+        rx_bytes(i) = rx_bits_log(pin:pin+7) * (2.^(7:-1:0))';
+        pin += 8;
+        %printf("%d 0x%02x\n", i, rx_bytes(i));
+      end
+
+      f=fopen("horus_rx_bits_binary.txt","wt");
+      fwrite(f, rx_bytes, "uchar");
+      fclose(f);
+
+      system("../src/horus_l2");  % compile instructions above
+    end
+
+    bit = uw_loc + length(states.binary.uw);
+    uw_loc = find_uw(states.binary, bit, rx_bits_log);
+   
+  endwhile
+endfunction
+ 
+
+% BER counter and test frame sync logic
+
+function states = ber_counter(states, test_frame, rx_bits_buf)
+  nsym = states.nsym;
+  state = states.ber_state;
+  next_state = state;
+
+  if state == 0
+
+    % try to sync up with test frame
+
+    nerrs_min = nsym;
+    for i=1:nsym
+      error_positions = xor(rx_bits_buf(i:nsym+i-1), test_frame);
+      nerrs = sum(error_positions);
+      if nerrs < nerrs_min
+        nerrs_min = nerrs;
+        states.coarse_offset = i;
+      end
+    end
+    if nerrs_min/nsym < 0.05 
+      next_state = 1;
+    end
+    if bitand(states.verbose,0x4)
+      printf("coarse offset: %d nerrs_min: %d next_state: %d\n", states.coarse_offset, nerrs_min, next_state);
+    end
+  end
+
+  if state == 1  
+
+    % we're synced up, lets measure bit errors
+
+    error_positions = xor(rx_bits_buf(states.coarse_offset:states.coarse_offset+nsym-1), test_frame);
+    nerrs = sum(error_positions);
+    if nerrs/nsym > 0.1
+      next_state = 0;
+    else
+      states.Terrs += nerrs;
+      states.Tbits += nsym;
+      states.nerr_log = [states.nerr_log nerrs];
+    end
+  end
+
+  states.ber_state = next_state;
+endfunction
+
+
+% simulation of tx and rx side, add noise, channel impairments ----------------------
+
+function run_sim(test_frame_mode)
+  frames = 60;
+  EbNodB = 10;
+  timing_offset = 0.0; % see resample() for clock offset below
+  fading = 0;          % modulates tx power at 2Hz with 20dB fade depth, 
+                       % to simulate balloon rotating at end of mission
+  df     = 0;          % tx tone freq drift in Hz/s
+  dA     = 1;          % amplitude imbalance of tones (note this affects Eb so not a gd idea)
+
+  more off
+  rand('state',1); 
+  randn('state',1);
+
+  % ----------------------------------------------------------------------
+
+  % sm2000 config ------------------------
+  %states = fsk_horus_init(96000, 1200);
+  %states.f1_tx = 4000;
+  %states.f2_tx = 5200;
+
+  if test_frame_mode == 4
+    % horus rtty config ---------------------
+    states = fsk_horus_init(8000, 100);
+    states.f1_tx = 1200;
+    states.f2_tx = 1600;
+    states.tx_bits_file = "horus_tx_bits_rtty.txt"; % Octave file of bits we FSK modulate
+    
+  end
+                               
+  if test_frame_mode == 5
+    % horus binary config ---------------------
+    states = fsk_horus_init(8000, 100);
+    states.f1_tx = 1200;
+    states.f2_tx = 1600;
+    %%%states.tx_bits_file = "horus_tx_bits_binary.txt"; % Octave file of bits we FSK modulate
+	states.tx_bits_file = "horus_payload_rtty.txt"
+  end
+
+  % ----------------------------------------------------------------------
+
+  states.verbose = 0x1;
+  N = states.N;
+  P = states.P;
+  Rs = states.Rs;
+  nsym = states.nsym;
+  Fs = states.Fs;
+  states.df = df;
+  states.dA = dA;
+
+  EbNo = 10^(EbNodB/10);
+  variance = states.Fs/(states.Rs*EbNo);
+
+  % set up tx signal with payload bits based on test mode
+
+  if test_frame_mode == 1
+     % test frame of bits, which we repeat for convenience when BER testing
+    test_frame = round(rand(1, states.nsym));
+    tx_bits = [];
+    for i=1:frames+1
+      tx_bits = [tx_bits test_frame];
+    end
+  end
+  if test_frame_mode == 2
+    % random bits, just to make sure sync algs work on random data
+    tx_bits = round(rand(1, states.nsym*(frames+1)));
+  end
+  if test_frame_mode == 3
+    % ...10101... sequence
+    tx_bits = zeros(1, states.nsym*(frames+1));
+    tx_bits(1:2:length(tx_bits)) = 1;
+  end
+ 
+  if (test_frame_mode == 4) || (test_frame_mode == 5)
+
+    % load up a horus msg from disk and modulate that
+
+    test_frame = load(states.tx_bits_file);
+    ltf = length(test_frame);
+    ntest_frames = ceil((frames+1)*nsym/ltf);
+    tx_bits = [];
+    for i=1:ntest_frames
+      tx_bits = [tx_bits test_frame];
+    end
+  end
+
+  tx = fsk_horus_mod(states, tx_bits);
+
+  %tx = resample(tx, 1000, 1001); % simulated 1000ppm sample clock offset
+
+  if fading
+     ltx = length(tx);
+     tx = tx .* (1.1 + cos(2*pi*2*(0:ltx-1)/Fs))'; % min amplitude 0.1, -20dB fade, max 3dB
+  end
+
+  noise = sqrt(variance)*randn(length(tx),1);
+  rx    = tx + noise;
+  %rx = real(rx);
+  %b1 = fir2(100, [0 4000 5200 48000]/48000, [1 1 0.5 0.5]);
+  %rx = filter(b1,1,rx);
+  %[b a] = cheby2(6,40,[3000 6000]/(Fs/2));
+  %rx = filter(b,a,rx);
+  %rx = sign(rx);
+  %rx(find (rx > 1)) = 1;
+  %rx(find (rx < -1)) = -1;
+
+  % dump simulated rx file
+  ftx=fopen("fsk_horus_100bd_binary.raw","wb"); rxg = rx*1000; fwrite(ftx, rxg, "short"); fclose(ftx);
+
+  timing_offset_samples = round(timing_offset*states.Ts);
+  st = 1 + timing_offset_samples;
+  rx_bits_buf = zeros(1,2*nsym);
+  x_log = [];
+  norm_rx_timing_log = [];
+  f1_int_resample_log = [];
+  f2_int_resample_log = [];
+  f1_log = f2_log = [];
+  EbNodB_log = [];
+  rx_bits_log = [];
+  rx_bits_sd_log = [];
+
+  for f=1:frames
+
+    % extract nin samples from input stream
+
+    nin = states.nin;
+    en = st + states.nin - 1;
+    sf = rx(st:en);
+    st += nin;
+
+    % demodulate to stream of bits
+
+    [rx_bits states] = fsk_horus_demod(states, sf);
+    rx_bits_buf(1:nsym) = rx_bits_buf(nsym+1:2*nsym);
+    rx_bits_buf(nsym+1:2*nsym) = rx_bits;
+    rx_bits_log = [rx_bits_log rx_bits];
+    rx_bits_sd_log = [rx_bits_sd_log states.rx_bits_sd];
+
+    norm_rx_timing_log = [norm_rx_timing_log states.norm_rx_timing];
+    x_log = [x_log states.x];
+    f1_int_resample_log = [f1_int_resample_log abs(states.f1_int_resample)];
+    f2_int_resample_log = [f2_int_resample_log abs(states.f2_int_resample)];
+    f1_log = [f1_log states.f1];
+    f2_log = [f2_log states.f2];
+    EbNodB_log = [EbNodB_log states.EbNodB];
+
+    if test_frame_mode == 1
+       states = ber_counter(states, test_frame, rx_bits_buf);
+    end
+  end
+
+  if test_frame_mode == 1
+    printf("frames: %d Tbits: %d Terrs: %d BER %4.3f\n", frames, states.Tbits,states. Terrs, states.Terrs/states.Tbits);
+  end
+
+  if test_frame_mode == 4
+    extract_and_print_rtty_packets(states, rx_bits_log, rx_bits_sd_log)
+  end
+
+  if test_frame_mode == 5
+    extract_and_decode_binary_packets(states, rx_bits_log);
+  end
+
+  figure(1);
+  plot(f1_int_resample_log,'+')
+  hold on;
+  plot(f2_int_resample_log,'g+')
+  hold off;
+
+  figure(2)
+  clf
+  m = max(abs(x_log));
+  plot(x_log,'+')
+  axis([-m m -m m])
+  title('fine timing metric')
+
+  figure(3)
+  clf
+  subplot(211)
+  plot(norm_rx_timing_log);
+  axis([1 frames -1 1])
+  title('norm fine timing')
+  subplot(212)
+  plot(states.nerr_log)
+  title('num bit errors each frame')
+
+  figure(4)
+  clf
+  subplot(211)
+  plot(real(rx(1:Fs)))
+  title('rx signal at demod input')
+  subplot(212)
+  plot(abs(fft(rx(1:Fs))))
+
+  figure(5)
+  clf
+  plot(f1_log)
+  hold on;
+  plot(f2_log,'g');
+  hold off;
+  title('tone frequencies')
+  axis([1 frames 0 Fs/2])
+
+  figure(6)
+  clf
+  plot(EbNodB_log);
+  title('Eb/No estimate')
+
+endfunction
+
+
+% demodulate a file of 8kHz 16bit short samples --------------------------------
+
+function rx_bits_log = demod_file(filename, test_frame_mode, noplot)
+  fin = fopen(filename,"rb"); 
+  more off;
+
+  %states = fsk_horus_init(96000, 1200);
+
+  if test_frame_mode == 4
+    % horus rtty config ---------------------
+    states = fsk_horus_init(8000, 100);
+    uwstates = fsk_horus_init_rtty_uw(states);
+  end
+                               
+  if test_frame_mode == 5
+    % horus binary config ---------------------
+    states = fsk_horus_init(8000, 100);
+    uwstates = fsk_horus_init_binary_uw;
+  end
+
+  states.verbose = 0x1;
+
+  N = states.N;
+  P = states.P;
+  Rs = states.Rs;
+  nsym = states.nsym;
+  rand('state',1); 
+  test_frame = round(rand(1, states.nsym));
+
+  frames = 0;
+  rx = [];
+  rx_bits_log = [];
+  rx_bits_sd_log = [];
+  norm_rx_timing_log = [];
+  f1_int_resample_log = [];
+  f2_int_resample_log = [];
+  EbNodB_log = [];
+  ppm_log = [];
+  f1_log = [];
+  f2_log = [];
+  rx_bits_buf = zeros(1,2*nsym);
+
+  % First extract raw bits from samples ------------------------------------------------------
+
+  printf("demod of raw bits....\n");
+
+  finished = 0;
+  while (finished == 0)
+
+    % hit any key to finish (useful for real time streaming)
+
+    %x = kbhit(1);
+    %if length(x)
+    %  finished = 1;
+    %end
+
+    % extract nin samples from input stream
+
+    nin = states.nin;
+    [sf count] = fread(fin, nin, "short");
+    rx = [rx; sf];
+
+    if count == nin
+      frames++;
+
+      % demodulate to stream of bits
+
+      [rx_bits states] = fsk_horus_demod(states, sf);
+      rx_bits_buf(1:nsym) = rx_bits_buf(nsym+1:2*nsym);
+      rx_bits_buf(nsym+1:2*nsym) = rx_bits; % xor(rx_bits,ones(1,nsym));
+      rx_bits_log = [rx_bits_log rx_bits];
+      rx_bits_sd_log = [rx_bits_sd_log states.rx_bits_sd];
+      norm_rx_timing_log = [norm_rx_timing_log states.norm_rx_timing];
+      f1_int_resample_log = [f1_int_resample_log abs(states.f1_int_resample)];
+      f2_int_resample_log = [f2_int_resample_log abs(states.f2_int_resample)];
+      EbNodB_log = [EbNodB_log states.EbNodB];
+      ppm_log = [ppm_log states.ppm];
+      f1_log = [f1_log states.f1];
+      f2_log = [f2_log states.f2];
+
+      if test_frame_mode == 1
+        states = ber_counter(states, test_frame, rx_bits_buf);
+        if states.ber_state == 1
+          states.verbose = 0;
+        end
+      end
+    else
+      finished = 1;
+    end
+  end
+  fclose(fin);
+
+  if exist("noplot") == 0
+    printf("plotting...\n");
+
+    figure(1);
+    plot(f1_log);
+    hold on;
+    plot(f2_log,'g');
+    hold off;
+
+    figure(2);
+    plot(f1_int_resample_log,'+')
+    hold on;
+    plot(f2_int_resample_log,'g+')
+    hold off;
+
+    figure(3)
+    clf
+    subplot(211)
+    plot(norm_rx_timing_log)
+    axis([1 frames -0.5 0.5])
+    title('norm fine timing')
+    grid
+    subplot(212)
+    plot(states.nerr_log)
+    title('num bit errors each frame')
+ 
+    figure(4)
+    clf
+    plot(EbNodB_log);
+    title('Eb/No estimate')
+
+    figure(5)
+    clf
+    rx_nowave = rx(1000:length(rx));
+    subplot(211)
+    plot(rx_nowave(1:states.Fs));
+    title('input signal to demod (1 sec)')
+    xlabel('Time (samples)');
+    axis([1 states.Fs -35000 35000])
+
+    % normalise spectrum to 0dB full scale with a 32767 sine wave input
+
+    subplot(212)
+    RxdBFS = 20*log10(abs(fft(rx_nowave(1:states.Fs)))) - 20*log10((states.Fs/2)*32767);
+    plot(RxdBFS)
+    axis([1 states.Fs/2 -80 0])
+    xlabel('Frequency (Hz)');
+
+    figure(6);
+    clf
+    plot(ppm_log)
+    title('Sample clock (baud rate) offset in PPM');
+  end
+
+  if test_frame_mode == 1
+    printf("frames: %d Tbits: %d Terrs: %d BER %4.3f EbNo: %3.2f\n", frames, states.Tbits,states. Terrs, states.Terrs/states.Tbits, mean(EbNodB_log));
+  end
+
+  % we can decode both protocols at the same time
+
+  if (test_frame_mode == 4) || (test_frame_mode == 5)
+    extract_and_print_rtty_packets(states, rx_bits_log, rx_bits_sd_log)
+    extract_and_decode_binary_packets(states, rx_bits_log);
+  end
+endfunction
+
+
+% run test functions from here during development
+
+if exist("fsk_horus_as_a_lib") == 0
+  run_sim(5);
+  %rx_bits = demod_file("horus.raw",4);
+  %rx_bits = demod_file("fsk_horus_100bd_binary.raw",5);
+  %rx_bits = demod_file("~/Desktop/phorus_binary_ascii.wav",4);
+  %rx_bits = demod_file("~/Desktop/binary/horus_160102_binary_rtty_2.wav",4);
+  
+  
+  %rx_bits = demod_file("~/Desktop/horus_160102_vk5ei_capture2.wav",4);
+  %rx_bits = demod_file("~/Desktop/horus_rtty_binary.wav",4);
+  %rx_bits = demod_file("t.raw",5);
+  %rx_bits = demod_file("~/Desktop/fsk_horus_10dB_1000ppm.wav",4);
+  %rx_bits = demod_file("~/Desktop/fsk_horus_6dB_0ppm.wav",4);
+  %rx_bits = demod_file("test.raw",1,1);
+  %rx_bits = demod_file("/dev/ttyACM0",1);
+  %rx_bits = demod_file("fsk_horus_rx_1200_96k.raw",1);
+  %rx_bits = demod_file("mp.raw",4);
+  %rx_bits = demod_file("~/Desktop/launchbox_v2_landing_8KHz_final.wav",4);
+end
diff --git a/codec2-dev/octave/yafsk.m b/codec2-dev/octave/yafsk.m
index 9a5a1f81..d38169c9 100644
--- a/codec2-dev/octave/yafsk.m
+++ b/codec2-dev/octave/yafsk.m
@@ -8,14 +8,16 @@
 % [x] - Direct SDR modulator, probably not FM based
 % [x] - Direct SDR non-coherent demodulator
 %    [x] - Core demodulation routine
-%    | | - Timing offset estimation
+%    |o| - Timing offset estimation
 %    < >- Freq. offset estimation
-%    ( ) - Bit slip, maybe
+%    (+) - Bit slip, maybe
 % { } - Port sim from fsk_horus
 % [ ] - The C port
 % [ ] - Some stuff to verify the C port
-
+% [ ] - 4FSK variant
+%    ( ) - All of that other stuff, but for 4fsk
 %clear all;
+
 graphics_toolkit('gnuplot');
 %fm
 
@@ -24,7 +26,9 @@ pkg load signal;
 %Basic parameters for a simple FSK modem
 fsk_setup_info.Rs = 4800;  % Symbol rate
 fsk_setup_info.nfsk = 2;      % Number of unique symbols. Must be 2.
+fsk_setup_info.P = 5;		%Something something fine timing est
 fsk_setup_info.Fs = 48000; % Sample frequency
+fsk_setup_info.timing_syms = 10; %How many symbols over which to figure fine timing
 fsk_setup_info.Fsym = fsk_setup_info.Rs; %Symbol spacing
 fsk_setup_info.txmap = @(bits) bits+1; %Map TX bits to 2fsk symbols
 fsk_setup_info.rxmap = @(syms) syms==1; %Map 2fsk RX symbols to bits
@@ -37,7 +41,9 @@ function states = yafsk_init(fsk_config)
   Ts = states.Ts = Fs/Rs;
   Fsym = states.Fsym = fsk_config.Fsym;
   states.config = fsk_config;
-
+  P = states.P = fsk_config.P;
+  timing_syms = states.timing_syms = fsk_config.timing_syms;
+  
   if nfsk != 2
     error("Gotta be 2fsk")
   endif
@@ -49,7 +55,12 @@ function states = yafsk_init(fsk_config)
   states.dc = zeros(1,nfsk);
   states.rx_phi = ones(1,nfsk);
   states.isamp = 0;
+  states.ssamp = 0;
   states.sums = zeros(1,nfsk);
+  states.ssums = zeros(1,nfsk);
+  timing_db1 = timing_db2 = zeros(1,timing_syms*(Ts/P))
+  states.timing_db1 = timing_db1;
+  states.timing_db2 = timing_db2;
 endfunction
 
 function [tx states] = yafsk_mod(states,bits)
@@ -87,7 +98,7 @@ function d = idmp(data, M)
     end
 endfunction
 
-function [bits states phis] = yafsk_demod_2a(states,rx)
+function [bits states phis softsyms] = yafsk_demod_2a(states,rx)
   fine_timing = 1;
   Fs = states.Fs;
   Rs = states.Rs;
@@ -104,11 +115,20 @@ function [bits states phis] = yafsk_demod_2a(states,rx)
 
   sum_f1 = states.sums(1);
   sum_f2 = states.sums(2);
-
+  
+  ssum_f1 = states.ssums(1);
+  ssum_f2 = states.ssums(2);
+  
+  timing_db1 = states.timing_db1;
+  timing_db2 = states.timing_db2;
+  
+  ssamp = states.ssamp;
   isamp = states.isamp;
+  
   symcnt = 1;
   subcnt = 1;
   syms = [0];
+  softsyms = [0];
   sums1 = [0];
   sums2 = [0];
   phis = [0];
@@ -119,12 +139,11 @@ function [bits states phis] = yafsk_demod_2a(states,rx)
   ssum_f2 = 0;
   ssamp=0;
  
-  timing_syms = 10;
+  timing_syms = states.timing_syms;
   timing_nudge = .09; %How far to 'nudge' the sampling point
                       %This really ought to be fixed somewhere else
                       
-  timing_samps = timing_syms*(Ts/4);
-  timing_db2 = timing_db1 = zeros(1,timing_samps);  
+  timing_samps = timing_syms*(Ts/P);
 
   for ii = (1:length(rx))
     phy_f1 *= dphase_f1;   %Spin the oscillators
@@ -147,6 +166,7 @@ function [bits states phis] = yafsk_demod_2a(states,rx)
     isamp += 1;
     if isamp>=Ts %If it's time to take a sample and spit out a symbol..
       syms(symcnt) = (abs(sum_f1)>abs(sum_f2))+1; %Spit out a symbol
+      softsyms(symcnt) = abs(sum_f1) - abs(sum_f2);
       symcnt += 1;
       
       %Fine timing estimation and adjustment
@@ -185,6 +205,12 @@ function [bits states phis] = yafsk_demod_2a(states,rx)
       sum_f2 = 0;
       isamp -= Ts;    %Reset integrator count
      
+      if(mod(symcnt,10000)==0)
+        ab_f1 = abs(phy_f1)
+        phy_f1 = phy_f1/ab_f1;
+        ab_f2 = abs(phy_f2)
+        phy_f2 = phy_f2/ab_f2;
+	  endif
 
     endif
 
@@ -211,7 +237,14 @@ function [bits states phis] = yafsk_demod_2a(states,rx)
   states.sums(1) = sum_f1;
   states.sums(2) = sum_f2;
 
+  states.ssum(1) = ssum_f1;
+  states.ssum(2) = ssum_f2;
+
+  states.ssamp = ssamp;
   states.isamp = isamp;
+  
+  states.timing_db1 = timing_db1;
+  states.timing_db2 = timing_db2;
 
   bits = states.config.rxmap(syms);
   
@@ -440,7 +473,6 @@ function run_sim
 
 endfunction
 
-
 % Bit error rate test ----------------------------------------------------------
 % Params - aEsNodB - EbNo in decibels
 %        - timing_offset - how far the fine timing is offset
@@ -449,7 +481,7 @@ endfunction
 % Returns - ber - teh measured BER
 %         - thrcoh - theory BER of a coherent demod
 %         - thrncoh - theory BER of non-coherent demod
-function [ber thrcoh thrncoh rxphi] = nfbert_2(aEsNodB,modem_config, bitcnt=12000, timing_offset = 1, freq_offset = 0, burst = 0,samp_clk_offset = 0)
+function [ber thrcoh thrncoh rxphis] = nfbert_2(aEsNodB,modem_config, bitcnt=12000, timing_offset = 1, freq_offset = 0, burst = 0,samp_clk_offset = 0)
 
   rand('state',1); 
   randn('state',1);
@@ -457,7 +489,11 @@ function [ber thrcoh thrncoh rxphi] = nfbert_2(aEsNodB,modem_config, bitcnt=1200
   %How many bits should this test run?
   %bitcnt = 12000;
   
-  test_bits = [zeros(1,16) 1 0 0 1 1 0 0 1 0 1 0 1 0 1 0 1 (rand(1,bitcnt)>.5) ones(1,300) zeros(1,300)]; %Random bits. Pad with zeros to prime the filters
+  framesync = []
+  framehdr = [1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 0]
+  convhdr = framehdr;%[.5 -.5 .5 -.5 1 -1 1 -1 -1 1 1 1 -1 1 -1 -1];
+  %framehdr = [1 0 1 0 0 0 1 0 1 0 0 1]
+  test_bits = [framesync framehdr (rand(1,bitcnt)>.5)]; %Random bits. Pad with zeros to prime the filters
   states.M = 1;
   states = yafsk_init(modem_config);
   
@@ -489,12 +525,21 @@ function [ber thrcoh thrncoh rxphi] = nfbert_2(aEsNodB,modem_config, bitcnt=1200
   
   rx    = rx(timing_offset:length(rx));
   
+  [rx_bits states rxphis sbits] = yafsk_demod_2a(states,rx);
+  
   
-  [rx_bits states rxphis] = yafsk_demod_2a(states,rx);
   ber = 1;
   
+  hsig = -1*fliplr((framehdr*2)-1);
+  figure(3);
+  corrfd = conv(hsig,sbits);
+  for ii = (1:length(corrfd)-length(hsig))
+    secte = sum(abs(sbits(ii:ii+length(hsig))).^2);
+	corrfd = corrfd(ii)/secte;
+  end
   %thing to account for offset from input data to output data
   %No preamble detection yet
+  figure(4);
   plot(rxphis);
   ox = 1;
   
@@ -522,6 +567,14 @@ function [ber thrcoh thrncoh rxphi] = nfbert_2(aEsNodB,modem_config, bitcnt=1200
     ber = min([ber bern]);
   end
   
+  %Try to find frame header
+  for offset = (1:length(rx_bits)-length(framehdr))
+	hd = sum(xor(framehdr,rx_bits(offset:offset-1+length(framehdr))));
+	if(hd<=2)
+		printf("Found possible header at offset %d\n",offset);
+	endif
+  end
+  
   figure(5);
   stairs(1.0*xerr);
   
diff --git a/codec2-dev/src/CMakeLists.txt b/codec2-dev/src/CMakeLists.txt
index 68241b89..6fe28f9d 100644
--- a/codec2-dev/src/CMakeLists.txt
+++ b/codec2-dev/src/CMakeLists.txt
@@ -141,7 +141,9 @@ set(CODEC2_SRCS
     fifo.c
     fdmdv.c
     fm.c
+    fsk.c
     kiss_fft.c
+    kiss_fftr.c
     linreg.c
     interp.c
     lsp.c
@@ -166,6 +168,7 @@ set(CODEC2_PUBLIC_HEADERS
     codec2_fdmdv.h
     codec2_cohpsk.h
     codec2_fm.h
+    fsk.h
     codec2_fifo.h
     comp.h
     comp_prim.h
@@ -232,6 +235,12 @@ target_link_libraries(freedv_tx ${CMAKE_REQUIRED_LIBRARIES} codec2)
 add_executable(freedv_rx freedv_rx.c)
 target_link_libraries(freedv_rx ${CMAKE_REQUIRED_LIBRARIES} codec2)
 
+add_executable(fsk_mod fsk_mod.c)
+target_link_libraries(fsk_mod ${CMAKE_REQUIRED_LIBRARIES} codec2)
+
+add_executable(fsk_demod fsk_demod.c)
+target_link_libraries(fsk_demod ${CMAKE_REQUIRED_LIBRARIES} codec2)
+
 add_executable(fm_demod fm_demod.c fm.c)
 target_link_libraries(fm_demod ${CMAKE_REQUIRED_LIBRARIES})
 
@@ -270,6 +279,7 @@ install(TARGETS
     fdmdv_get_test_bits
     fdmdv_mod fdmdv_demod
     fm_demod
+    fsk_mod
     fdmdv_put_test_bits
     fdmdv_interleave
     insert_errors
diff --git a/codec2-dev/src/fsk.c b/codec2-dev/src/fsk.c
new file mode 100644
index 00000000..a426b511
--- /dev/null
+++ b/codec2-dev/src/fsk.c
@@ -0,0 +1,572 @@
+/*---------------------------------------------------------------------------*\
+
+  FILE........: fsk.c
+  AUTHOR......: Brady O'Brien
+  DATE CREATED: 7 January 2016
+
+  C Implementation of 2FSK modulator/demodulator, based on octave/fsk_horus.m
+
+\*---------------------------------------------------------------------------*/
+
+/*
+  Copyright (C) 2016 David Rowe
+
+  All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License version 2.1, as
+  published by the Free Software Foundation.  This program is
+  distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+  License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+/*---------------------------------------------------------------------------*\
+
+                               DEFINES
+
+\*---------------------------------------------------------------------------*/
+
+/* P oversampling rate constant -- should probably be init-time configurable */
+#define ct_P 8
+
+/*---------------------------------------------------------------------------*\
+
+                               INCLUDES
+
+\*---------------------------------------------------------------------------*/
+
+#include <assert.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <math.h>
+
+#include "fsk.h"
+#include "comp_prim.h"
+#include "kiss_fftr.h"
+
+/*---------------------------------------------------------------------------*\
+
+                               FUNCTIONS
+
+\*---------------------------------------------------------------------------*/
+
+/*---------------------------------------------------------------------------*\
+
+  FUNCTION....: fsk_create
+  AUTHOR......: Brady O'Brien
+  DATE CREATED: 7 January 2016
+  
+  Create and initialize an instance of the FSK modem. Returns a pointer
+  to the modem state/config struct. One modem config struct may be used
+  for both mod and demod. returns NULL on failure.
+
+\*---------------------------------------------------------------------------*/
+
+struct FSK * fsk_create(int Fs, int Rs, int tx_f1,int tx_f2)
+{
+    struct FSK *fsk;
+    int i;
+    int Ndft = 0;
+    int memold;
+    
+    /* Check configuration validity */
+    assert(Fs > 0 );
+    assert(Rs > 0 );
+    assert(tx_f1 > 0);
+    assert(tx_f2 > 0);
+    assert(ct_P > 0);
+    /* Ts (Fs/Rs) must be an integer */
+    assert( (Fs%Rs) == 0 );
+    /* Ts/P (Fs/Rs/P) must be an integer */
+    assert( ((Fs/Rs)%ct_P) == 0 );
+    
+    fsk = (struct FSK*) malloc(sizeof(struct FSK));
+    if(fsk == NULL) return NULL;
+    
+    /* Find smallest 2^N value that fits Fs for efficient FFT */
+    /* It would probably be better to use KISS-FFt's routine here */
+    for(i=1; i; i<<=1)
+        if(Fs&i)
+            Ndft = i<<1;
+    
+    /* Set constant config parameters */
+    fsk->Fs = Fs;
+    fsk->Rs = Rs;
+    fsk->Ts = Fs/Rs;
+    fsk->N = Fs;
+    fsk->P = ct_P;
+    fsk->Nsym = fsk->N/fsk->Ts;
+    fsk->Ndft = Ndft;
+    fsk->Nmem = fsk->N+(2*fsk->Ts);
+    fsk->f1_tx = tx_f1;
+    fsk->f2_tx = tx_f2;
+    fsk->nin = fsk->N;
+    
+    /* Set up rx state */
+    fsk->phi1_d = 0;
+    fsk->phi2_d = 0;
+    fsk->phi1_c.real = 1;
+    fsk->phi1_c.imag = 0;
+    fsk->phi2_c.real = 1;
+    fsk->phi2_c.imag = 0;
+    
+    memold = (4*fsk->Ts);
+    
+    fsk->nstash = memold; 
+    fsk->samp_old = (float*) malloc(sizeof(float)*memold);
+    if(fsk->samp_old == NULL){
+        free(fsk);
+        return NULL;
+    }
+    
+    for(int i=0;i<memold;i++)fsk->samp_old[i]=0;
+    
+    fsk->fft_cfg = kiss_fftr_alloc(Ndft,0,NULL,NULL);
+    if(fsk->fft_cfg == NULL){
+        free(fsk->samp_old);
+        free(fsk);
+        return NULL;
+    }
+    
+    fsk->norm_rx_timing = 0;
+    
+    /* Set up tx state */
+    fsk->tx_phase = 0;
+    fsk->tx_phase_c.imag = 0;
+    fsk->tx_phase_c.real = 1;
+    
+    /* Set up demod stats */
+    fsk->EbNodB = 0;
+    fsk->f1_est = 0;
+    fsk->f2_est = 0;
+    
+    return fsk;
+}
+
+uint32_t fsk_nin(struct FSK *fsk){
+    return (uint32_t)fsk->nin;
+}
+
+void fsk_destroy(struct FSK *fsk){
+    free(fsk->fft_cfg);
+    free(fsk->samp_old);
+    free(fsk);
+}
+
+/*
+ * Internal function to estimate the frequencies of the two tones within a block of samples.
+ * This is split off because it is fairly complicated, needs a bunch of memory, and probably
+ * takes more cycles than the rest of the demod.
+ * Parameters:
+ * fsk - FSK struct from demod containing FSK config
+ * fsk_in - block of samples in this demod cycles, must be nin long
+ * f1_est - pointer to f1 estimate variable in demod
+ * f2_est - pointer to f2 estimate variable in demod
+ * twist - pointer to twist estimate in demod - Note: this isn't correct right now
+ */
+void fsk_demod_freq_est(struct FSK *fsk, float fsk_in[],float *f1_est,float *f2_est,float *twist){
+    int Ndft = fsk->Ndft;
+    int Fs = fsk->Fs;
+    int nin = fsk->nin;
+    int i,j;
+    int fft_samps;
+    float hann;
+    float max;
+    int m1,m2;
+    kiss_fftr_cfg fft_cfg = fsk->fft_cfg;
+    
+    /* Array to do complex FFT from using kiss_fft */
+    /* It'd probably make more sense here to use kiss_fftr */
+    //kiss_fft_scalar fftin[Ndft];
+    kiss_fft_scalar *fftin = (kiss_fft_scalar*)malloc(sizeof(kiss_fft_scalar)*Ndft);
+    //kiss_fft_cpx fftout[(Ndft/2)+1];
+    kiss_fft_cpx *fftout = (kiss_fft_cpx*)malloc(sizeof(kiss_fft_cpx)*(Ndft/2)+1);
+    fft_samps = nin<Ndft?nin:Ndft;
+    
+    /* Copy FSK buffer into reals of FFT buffer and apply a hann window */
+    for(i=0; i<fft_samps ; i++){
+        /* Note : This is a sort of bug copied from fsk_horus */
+        /* if N+Ts/2 > Ndft, the end of the hann window may be cut off */
+        /* resulting in a dirty FFT */
+        /* An easy fix would be to ensure that Ndft is always greater than N+Ts/2 
+         * instead of N */
+        /* This bug isn't a big deal and can't show up in the balloon config */
+        /* as 8192 > 8040 */
+        hann = sinf((M_PI*(float)i)/((float)nin-1));
+        
+        fftin[i] = (kiss_fft_scalar)hann*hann*fsk_in[i];
+    }
+    /* Zero out the remaining slots */
+    for(; i<Ndft;i++){
+        fftin[i] = 0;
+    }
+    
+    /* Do the FFT */
+    kiss_fftr(fft_cfg,fftin,fftout);
+    
+    /* Find the magnitude^2 of each freq slot and stash away in the real
+     * value, so this only has to be done once. Since we're only comparing
+     * these values and only need the mag of 2 points, we don't need to do
+     * a sqrt to each value */
+    for(i=0; i<Ndft/2; i++){
+        fftout[i].r = (fftout[i].r*fftout[i].r) + (fftout[i].i*fftout[i].i) ;
+    }
+    
+    /* Find the maximum */
+    max = 0;
+    m1 = 0;
+    for(i=0; i<Ndft/2; i++){
+        if(fftout[i].r > max){
+            max = fftout[i].r;
+            m1 = i;
+        }
+    }
+    
+    /* Zero out 100Hz around the maximum */
+    i = m1 - 100*Ndft/Fs;
+    i = i<0 ? 0 : i;
+    j = m1 + 100*Ndft/Fs;
+    j = j>Ndft/2 ? Ndft/2 : j;
+    
+    for(;i<j;i++)
+        fftout[i].r = 0;
+    
+    /* Find the other maximum */
+    max = 0;
+    m2 = 0;
+    for(i=0; i<Ndft/2; i++){
+        if(fftout[i].r > max){
+            max = fftout[i].r;
+            m2 = i;
+        }
+    }
+    
+    /* f1 is always the lower tone */
+    if(m1>m2){
+        j=m1;
+        m1=m2;
+        m2=j;
+    }
+    
+    *f1_est = (float)m1*(float)Fs/(float)Ndft;
+    *f2_est = (float)m2*(float)Fs/(float)Ndft;
+    *twist = 20*log10f((float)(m1)/(float)(m2));
+    //printf("ESTF - f1 = %f, f2 = %f, twist = %f \n",*f1_est,*f2_est,*twist);
+    
+    free(fftin);
+    free(fftout);
+}
+
+/*
+ * Euler's formula in a new convenient function
+ */
+static inline COMP comp_exp_j(float phi){
+    COMP res;
+    res.real = cosf(phi);
+    res.imag = sinf(phi);
+    return res;
+}
+
+/*
+ * Quick and easy complex 0
+ */
+static inline COMP comp0(){
+    COMP res;
+    res.real = 0;
+    res.imag = 0;
+    return res;
+}
+
+/*
+ * Compare the magnitude of a and b. if |a|>|b|, return true, otw false.
+ * This needs no square roots
+ */
+static inline int comp_mag_gt(COMP a,COMP b){
+    return ((a.real*a.real)+(a.imag*a.imag)) > ((b.real*b.real)+(b.imag*b.imag));
+}
+
+static inline COMP comp_normalize(COMP a){
+	float av = sqrtf((a.real*a.real)+(a.imag*a.imag));
+	COMP b;
+	b.real = a.real/av;
+	b.imag = a.imag/av;
+	return b;
+}
+
+
+void fsk_demod(struct FSK *fsk, uint8_t rx_bits[], float fsk_in[]){
+    int N = fsk->N;
+    int Ts = fsk->Ts;
+    int Rs = fsk->Rs;
+    int Fs = fsk->Fs;
+    int nsym = fsk->Nsym;
+    int nin = fsk->nin;
+    int P = fsk->P;
+    int Nmem = fsk->Nmem;
+    int i,j,dc_i,cbuf_i;
+    float ft1,ft2;
+    float twist; /* NOTE: This is not correct ATM */
+    int nstash = fsk->nstash;
+    COMP *f1_int, *f2_int;
+    COMP t1,t2;
+    float phi1_d = fsk->phi1_d;
+    float phi2_d = fsk->phi2_d;
+    float phi_ft = 0; /* Phase of fine timing estimator */
+    int nold = Nmem-nin;
+    float dphi1,dphi2,dphift;
+    float f1,f2;
+    float rx_timing,norm_rx_timing;//,old_norm_rx_timing;//,d_norm_rx_timing;
+    int using_old_samps;
+    float *sample_src;
+    COMP *f1_intbuf,*f2_intbuf;
+    
+    /* Estimate tone frequencies */
+    fsk_demod_freq_est(fsk,fsk_in,&f1,&f2,&twist);
+    
+    
+    /* allocate memory for the integrated samples */
+    /* Note: This must be kept after fsk_demod_freq_est for memory usage reasons */
+    //f1_int = (COMP*) alloca(sizeof(COMP)*(nsym+1)*P);
+    //f2_int = (COMP*) alloca(sizeof(COMP)*(nsym+1)*P);
+    
+    /* Allocate circular buffers for integration */
+    //f1_intbuf = (COMP*) alloca(sizeof(COMP)*Ts);
+    //f2_intbuf = (COMP*) alloca(sizeof(COMP)*Ts);
+        /* Note: This must be kept after fsk_demod_freq_est for memory usage reasons */
+    f1_int = (COMP*) malloc(sizeof(COMP)*(nsym+1)*P);
+    f2_int = (COMP*) malloc(sizeof(COMP)*(nsym+1)*P);
+    
+    /* Allocate circular buffers for integration */
+    f1_intbuf = (COMP*) malloc(sizeof(COMP)*Ts);
+    f2_intbuf = (COMP*) malloc(sizeof(COMP)*Ts);
+    
+    /* Note: This would all be quite a bit faster with complex oscillators, like
+     * TX. */
+    /* TODO: change these to complex oscillators */
+    /* Figure out how much to nudge each sample downmixer for every sample */
+    dphi1 = 2*M_PI*((float)(f1)/(float)(Fs));
+    dphi2 = 2*M_PI*((float)(f2)/(float)(Fs));
+
+
+    dc_i = 0;
+    cbuf_i = 0;
+    sample_src = &(fsk->samp_old[nstash-nold]);
+    using_old_samps = 1;
+    
+    /* Pre-fill integration buffer */
+    for(dc_i=0; dc_i<Ts-(Ts/P); dc_i++){
+        /* Switch sample source to new samples when we run out of old ones */
+        if(dc_i>=nold && using_old_samps){
+            sample_src = &fsk_in[0];
+            dc_i = 0;
+            using_old_samps = 0;
+        }
+        /* Downconvert and place into integration buffer */
+        f1_intbuf[dc_i]=fcmult(sample_src[dc_i],comp_exp_j(phi1_d));;
+        f2_intbuf[dc_i]=fcmult(sample_src[dc_i],comp_exp_j(phi2_d));;
+
+        /* Spin downconversion phases */
+        phi1_d -= dphi1;
+        phi2_d -= dphi2;
+        if(phi1_d<0) phi1_d+=2*M_PI;
+        if(phi2_d<0) phi2_d+=2*M_PI;
+    }
+    cbuf_i = dc_i;
+    
+    /* Integrate over Ts at offsets of Ts/P */
+    for(i=0; i<(nsym+1)*P; i++){
+        /* Downconvert and Place Ts/P samples in the integration buffers */
+        for(j=0; j<(Ts/P); j++,dc_i++){
+            /* Switch sample source to new samples when we run out of old ones */
+            if(dc_i>=nold && using_old_samps){
+                sample_src = &fsk_in[0];
+                dc_i = 0;
+                using_old_samps = 0;
+            }
+            /* Downconvert and place into integration buffer */
+            f1_intbuf[cbuf_i+j]=fcmult(sample_src[dc_i],comp_exp_j(phi1_d));;
+            f2_intbuf[cbuf_i+j]=fcmult(sample_src[dc_i],comp_exp_j(phi2_d));;
+
+            /* Spin downconversion phases */
+            phi1_d -= dphi1;
+            phi2_d -= dphi2;
+            if(phi1_d<0) phi1_d+=2*M_PI;
+            if(phi2_d<0) phi2_d+=2*M_PI;
+        }
+        cbuf_i += Ts/P;
+        if(cbuf_i>=Ts) cbuf_i = 0;
+        
+        /* Integrate over the integration buffers, save samples */
+        t1 = t2 = comp0();
+        for(j=0; j<Ts; j++){
+            t1 = cadd(t1,f1_intbuf[j]);
+            t2 = cadd(t2,f2_intbuf[j]);
+        }
+        f1_int[i] = t1;
+        f2_int[i] = t2;
+        
+    }
+
+    fsk->phi1_d = phi1_d;
+    fsk->phi2_d = phi2_d;
+
+    /* Stash samples away in the old sample buffer for the next round of bit getting */
+    memcpy((void*)&(fsk->samp_old[0]),(void*)&(fsk_in[nin-nstash]),sizeof(float)*nstash);
+    
+    /* Fine Timing Estimation */
+    /* Apply magic nonlinearity to f1_int and f2_int, shift down to 0, 
+     * exract angle */
+     
+    /* Figure out how much to spin the oscillator to extract magic spectral line */
+    dphift = 2*M_PI*((float)(Rs)/(float)(P*Rs));
+    t1=comp0();
+    for(i=0; i<(nsym+1)*P; i++){
+        /* Get abs of f1_int[i] and f2_int[i] */
+        ft1 = sqrtf( (f1_int[i].real*f1_int[i].real) + (f1_int[i].imag*f1_int[i].imag) );
+        ft2 = sqrtf( (f2_int[i].real*f2_int[i].real) + (f2_int[i].imag*f2_int[i].imag) );
+        /* Add and square 'em */
+        ft1 = ft1-ft2;
+        ft1 = ft1*ft1;
+        /* Spin the oscillator for the magic line shift */
+        /* Down shift and accumulate magic line */
+        t1 = cadd(t1,fcmult(ft1,comp_exp_j(phi_ft)));
+        phi_ft -= dphift;
+        if(phi_ft<0) phi_ft+=2*M_PI;
+    }
+    /* Get the magic angle */
+    norm_rx_timing =  -atan2f(t1.imag,t1.real)/(2*M_PI);
+    rx_timing = norm_rx_timing*(float)P;
+    
+    //old_norm_rx_timing = fsk->norm_rx_timing;
+    fsk->norm_rx_timing = norm_rx_timing;
+    
+    /* Estimate sample clock offset */
+    //d_norm_rx_timing = norm_rx_timing - old_norm_rx_timing;
+    
+    /* Filter out big jumps in due to nin change */
+    //if(fabsf(d_norm_rx_timing) < .2){
+    
+    /* Figure out how many samples are needed the next modem cycle */
+    if(norm_rx_timing > 0.25)
+        fsk->nin = N+Ts/2;
+    else if(norm_rx_timing < -0.25)
+        fsk->nin = N-Ts/2;
+    else
+        fsk->nin = N;
+    
+    /* Re-sample the integrators with linear interpolation magic */
+    int low_sample = (int)floorf(rx_timing);
+    float fract = rx_timing - (float)low_sample;
+    int high_sample = (int)ceilf(rx_timing);
+  
+    /* FINALLY, THE BITS */
+    /* also, resample f1_int,f2_int */
+    for(i=0; i<nsym; i++){
+        int st = (i+1)*P;
+        t1 =         fcmult(1-fract,f1_int[st+ low_sample]);
+        t1 = cadd(t1,fcmult(  fract,f1_int[st+high_sample]));
+        t2 =         fcmult(1-fract,f2_int[st+ low_sample]);
+        t2 = cadd(t2,fcmult(  fract,f2_int[st+high_sample]));
+        
+        /* THE BIT! */
+        rx_bits[i] = comp_mag_gt(t2,t1)?1:0;
+        /* Soft output goes here */
+    }
+    
+    free(f1_int);
+    free(f2_int);
+    free(f1_intbuf);
+    free(f2_intbuf);
+    
+    printf("rx_timing: %3.2f low_sample: %d high_sample: %d fract: %3.3f nin_next: %d\n", rx_timing, low_sample, high_sample, fract, fsk->nin);
+}
+
+
+void fsk_mod_realphase(struct FSK *fsk,float fsk_out[],uint8_t tx_bits[]){
+    float tx_phase = fsk->tx_phase; /* current TX phase */
+    int f1_tx = fsk->f1_tx;         /* '0' frequency */
+    int f2_tx = fsk->f2_tx;         /* '1' frequency */
+    int Ts = fsk->Ts;               /* samples-per-symbol */
+    int Fs = fsk->Fs;               /* sample freq */
+    int i,j;
+    uint8_t bit;
+    
+    /* delta-phase per cycle for both symbol freqs */
+    float dph_f1 = 2*M_PI*((float)(f1_tx)/(float)(Fs));
+    float dph_f2 = 2*M_PI*((float)(f2_tx)/(float)(Fs));
+
+    /* Note: Right now, this mirrors fm.c and fsk_horus.m, but it
+     * ought to be possible to make it more efficent (one complex mul per
+     * cycle) by using a complex number for the phase. */
+     
+    /* Outer loop through bits */
+    for(i=0; i<fsk->Nsym; i++){
+        bit = tx_bits[i];
+        for(j=0; j<Ts; j++){
+            /* Nudge phase forward a bit */
+            tx_phase += (bit==0)?dph_f1:dph_f2;
+            /* Make sure phase stays on [0,2pi] */
+            if(tx_phase>2*M_PI)
+                tx_phase -= 2*M_PI;
+            fsk_out[i*Ts+j] = 2*cosf(tx_phase);
+        }
+    }
+    
+    /* save TX phase */
+    fsk->tx_phase = tx_phase;
+    
+}
+
+void fsk_mod(struct FSK *fsk,float fsk_out[],uint8_t tx_bits[]){
+    COMP tx_phase_c = fsk->tx_phase_c; /* Current complex TX phase */
+    int f1_tx = fsk->f1_tx;         /* '0' frequency */
+    int f2_tx = fsk->f2_tx;         /* '1' frequency */
+    int Ts = fsk->Ts;               /* samples-per-symbol */
+    int Fs = fsk->Fs;               /* sample freq */
+    COMP dosc_f1, dosc_f2;          /* phase shift per sample */
+    COMP dph;                       /* phase shift of current bit */
+    int i,j;
+    float tx_phase_mag;
+    
+    /* Figure out the amount of phase shift needed per sample */
+    /*dosc_f1.real = cosf(2*M_PI*((float)(f1_tx)/(float)(Fs)));
+    dosc_f1.imag = sinf(2*M_PI*((float)(f1_tx)/(float)(Fs)));
+    dosc_f2.real = cosf(2*M_PI*((float)(f2_tx)/(float)(Fs)));
+    dosc_f2.imag = sinf(2*M_PI*((float)(f2_tx)/(float)(Fs)));*/
+    dosc_f1 = comp_exp_j(2*M_PI*((float)(f1_tx)/(float)(Fs)));
+    dosc_f2 = comp_exp_j(2*M_PI*((float)(f2_tx)/(float)(Fs)));
+    
+    /* Outer loop through bits */
+    for(i=0; i<fsk->Nsym; i++){
+        /* select current bit phase shift */
+        dph = tx_bits[i]==0?dosc_f1:dosc_f2;
+        for(j=0; j<Ts; j++){
+            tx_phase_c = cmult(tx_phase_c,dph);
+            fsk_out[i*Ts+j] = 2*tx_phase_c.real;
+        }
+    }
+    
+    /* Normalize TX phase to prevent drift */
+    tx_phase_mag = cabsolute(tx_phase_c);
+    tx_phase_c.real = tx_phase_c.real/tx_phase_mag;
+    tx_phase_c.imag = tx_phase_c.imag/tx_phase_mag;
+    
+    /* save TX phase */
+    fsk->tx_phase_c = tx_phase_c;
+    
+}
+
+
+
+
+
+
+
+
+
+
diff --git a/codec2-dev/src/fsk.h b/codec2-dev/src/fsk.h
new file mode 100644
index 00000000..85e624ce
--- /dev/null
+++ b/codec2-dev/src/fsk.h
@@ -0,0 +1,94 @@
+/*---------------------------------------------------------------------------*\
+
+  FILE........: fsk.h
+  AUTHOR......: Brady O'Brien
+  DATE CREATED: 6 January 2016
+
+  C Implementation of 2FSK modulator/demodulator, based on octave/fsk_horus.m
+
+\*---------------------------------------------------------------------------*/
+
+/*
+  Copyright (C) 2016 David Rowe
+
+  All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License version 2.1, as
+  published by the Free Software Foundation.  This program is
+  distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+  License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+
+#ifndef __C2FSK_H
+#define __C2FSK_H
+#include <stdint.h>
+#include "comp.h"
+#include "kiss_fftr.h"
+
+struct FSK {
+    /*  Static parameters set up by fsk_init */
+    int Ndft;               /* buffer size for freq offset est fft */
+    int Fs;                 /* sample freq */
+    int N;                  /* processing buffer size */
+    int Rs;                 /* symbol rate */
+    int Ts;                 /* samples per symbol */
+    int Nmem;               /* size of extra mem for timing adj */
+    int P;                  /* oversample rate for timing est/adj */
+    int Nsym;               /* Number of symbols spat out in a processing frame */
+    int f1_tx;              /* f1 for modulator */
+    int f2_tx;              /* f2 for modulator */
+    
+    /*  Parameters used by demod */
+    float phi1_d;           /* f1 oscillator for demod */
+    float phi2_d;           /* f2 oscillator for demod */
+    COMP phi1_c;
+    COMP phi2_c;
+    kiss_fftr_cfg fft_cfg;  /* Config for KISS FFT, used in freq est */
+    float norm_rx_timing;   /* Normalized RX timing */
+    
+    float *samp_old;        /* Ass end of last batch of samples */
+    int nstash;             /* How many elements are in there */
+    
+    /* Memory used by demod but not important between demod frames */
+    
+    /*  Parameters used by mod */
+    float tx_phase;         /* phase of modulator */
+    COMP tx_phase_c;        /* TX phase, but complex */ 
+    
+    /*  Statistics generated by demod */
+    float EbNodB;           /* Estimated EbNo in dB */
+    int f1_est;             /* Estimated f1 freq. */
+    int f2_est;             /* Estimated f2 freq. */
+    
+    /*  Parameters used by mod/demod and driving code */
+    int nin;                /* Number of samples to feed the next demod cycle */
+    
+};
+
+struct FSK * fsk_create(int Fs, int Rs, int tx_f1, int tx_f2);
+void fsk_destroy(struct FSK *fsk);
+
+void fsk_mod(struct FSK *fsk, 
+             float fsk_out[],   /* N float output modulated FSK samples */
+             uint8_t tx_bits[]  /* Nsym unpacked input bits in LSB  */
+             );
+
+/* number of input samples required for next call to fsk_demod() */
+
+uint32_t fsk_nin(struct FSK *fsk);
+
+void fsk_demod(struct FSK *fsk, 
+               uint8_t rx_bits[],     /* Nsym unpacked output demodulated rx bits in each LSB */
+               float fsk_in[]         /* N input float modulated FSK samples */
+               );
+
+
+
+#endif
diff --git a/codec2-dev/src/fsk_demod.c b/codec2-dev/src/fsk_demod.c
new file mode 100644
index 00000000..a8eaa310
--- /dev/null
+++ b/codec2-dev/src/fsk_demod.c
@@ -0,0 +1,79 @@
+/*---------------------------------------------------------------------------*\
+
+  FILE........: fsk_demod.c
+  AUTHOR......: Brady O'Brien
+  DATE CREATED: 8 January 2016
+
+  C test driver for fsk_demod in fsk.c. Reads in a stream of 32 bit cpu endian
+  floats and writes out the detected bits
+   
+
+\*---------------------------------------------------------------------------*/
+
+/*
+  Copyright (C) 2016 David Rowe
+
+  All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License version 2.1, as
+  published by the Free Software Foundation.  This program is
+  distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+  License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdio.h>
+#include "fsk.h"
+
+int main(int argc,char *argv[]){
+    struct FSK *fsk;
+    int Fs,Rs,f1,f2;
+    FILE *fin,*fout;
+    uint8_t *bitbuf;
+    float *modbuf;
+    
+    if(argc<7){
+        printf("Usage: %s samplerate bitrate f1 f2 rffile bitfile\n",argv[0]);
+        exit(1);
+    }
+    
+    /* Extract parameters */
+    Fs = atoi(argv[1]);
+    Rs = atoi(argv[2]);
+    f1 = atoi(argv[3]);
+    f2 = atoi(argv[4]);
+    
+    /* Open files */
+    fin = fopen(argv[5],"r");
+    fout = fopen(argv[6],"w");
+    
+    /* set up FSK */
+    fsk = fsk_create(Fs,Rs,f1,f2);
+    
+    if(fin==NULL || fout==NULL || fsk==NULL){
+        printf("Couldn't open test vector files\n");
+        goto cleanup;
+    }
+    
+    /* allocate buffers for processing */
+    bitbuf = (uint8_t*)alloca(sizeof(uint8_t)*fsk->Nsym);
+    modbuf = (float*)alloca(sizeof(float)*(fsk->N+fsk->Ts*2));
+    
+    /* Demodulate! */
+    while( fread(modbuf,sizeof(float),fsk_nin(fsk),fin) == fsk_nin(fsk) ){
+        fsk_demod(fsk,bitbuf,modbuf);
+        fwrite(bitbuf,sizeof(uint8_t),fsk->Nsym,fout);
+    }
+    
+    cleanup:
+    fclose(fin);
+    fclose(fout);
+    fsk_destroy(fsk);
+    exit(0);
+}
+
diff --git a/codec2-dev/src/fsk_mod.c b/codec2-dev/src/fsk_mod.c
new file mode 100644
index 00000000..2fafbde3
--- /dev/null
+++ b/codec2-dev/src/fsk_mod.c
@@ -0,0 +1,79 @@
+/*---------------------------------------------------------------------------*\
+
+  FILE........: fsk_mod.c
+  AUTHOR......: Brady O'Brien
+  DATE CREATED: 8 January 2016
+
+  C test driver for fsk_mod in fsk.c. Reads in a set of bits to modulate
+   from a file, passed as a parameter, and writes modulated output to
+   another file
+   
+
+\*---------------------------------------------------------------------------*/
+
+/*
+  Copyright (C) 2016 David Rowe
+
+  All rights reserved.
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU Lesser General Public License version 2.1, as
+  published by the Free Software Foundation.  This program is
+  distributed in the hope that it will be useful, but WITHOUT ANY
+  WARRANTY; without even the implied warranty of MERCHANTABILITY or
+  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public
+  License for more details.
+
+  You should have received a copy of the GNU Lesser General Public License
+  along with this program; if not, see <http://www.gnu.org/licenses/>.
+*/
+
+#include <stdio.h>
+#include "fsk.h"
+
+int main(int argc,char *argv[]){
+    struct FSK *fsk;
+    int Fs,Rs,f1,f2;
+    FILE *fin,*fout;
+    uint8_t *bitbuf;
+    float *modbuf;
+    
+    if(argc<7){
+        printf("Usage: %s samplerate bitrate f1 f2 bitfile modfile\n",argv[0]);
+        exit(1);
+    }
+    
+    /* Extract parameters */
+    Fs = atoi(argv[1]);
+    Rs = atoi(argv[2]);
+    f1 = atoi(argv[3]);
+    f2 = atoi(argv[4]);
+    
+    /* Open files */
+    fin = fopen(argv[5],"r");
+    fout = fopen(argv[6],"w");
+    
+    /* set up FSK */
+    fsk = fsk_create(Fs,Rs,f1,f2);
+    
+    if(fin==NULL || fout==NULL || fsk==NULL){
+        printf("Couldn't open test vector files\n");
+        goto cleanup;
+    }
+    
+    /* allocate buffers for processing */
+    bitbuf = (uint8_t*)alloca(sizeof(uint8_t)*fsk->Nsym);
+    modbuf = (float*)alloca(sizeof(float)*fsk->N);
+    
+    /* Modulate! */
+    while( fread(bitbuf,sizeof(uint8_t),fsk->Nsym,fin) == fsk->Nsym ){
+        fsk_mod(fsk,modbuf,bitbuf);
+        fwrite(modbuf,sizeof(float),fsk->N,fout);
+    }
+    
+    cleanup:
+    fclose(fin);
+    fclose(fout);
+    fsk_destroy(fsk);
+    exit(0);
+}
diff --git a/codec2-dev/src/kiss_fftr.c b/codec2-dev/src/kiss_fftr.c
new file mode 100644
index 00000000..b8e238b1
--- /dev/null
+++ b/codec2-dev/src/kiss_fftr.c
@@ -0,0 +1,159 @@
+/*
+Copyright (c) 2003-2004, Mark Borgerding
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
+    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#include "kiss_fftr.h"
+#include "_kiss_fft_guts.h"
+
+struct kiss_fftr_state{
+    kiss_fft_cfg substate;
+    kiss_fft_cpx * tmpbuf;
+    kiss_fft_cpx * super_twiddles;
+#ifdef USE_SIMD
+    void * pad;
+#endif
+};
+
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem)
+{
+    int i;
+    kiss_fftr_cfg st = NULL;
+    size_t subsize, memneeded;
+
+    if (nfft & 1) {
+        fprintf(stderr,"Real FFT optimization must be even.\n");
+        return NULL;
+    }
+    nfft >>= 1;
+
+    kiss_fft_alloc (nfft, inverse_fft, NULL, &subsize);
+    memneeded = sizeof(struct kiss_fftr_state) + subsize + sizeof(kiss_fft_cpx) * ( nfft * 3 / 2);
+
+    if (lenmem == NULL) {
+        st = (kiss_fftr_cfg) KISS_FFT_MALLOC (memneeded);
+    } else {
+        if (*lenmem >= memneeded)
+            st = (kiss_fftr_cfg) mem;
+        *lenmem = memneeded;
+    }
+    if (!st)
+        return NULL;
+
+    st->substate = (kiss_fft_cfg) (st + 1); /*just beyond kiss_fftr_state struct */
+    st->tmpbuf = (kiss_fft_cpx *) (((char *) st->substate) + subsize);
+    st->super_twiddles = st->tmpbuf + nfft;
+    kiss_fft_alloc(nfft, inverse_fft, st->substate, &subsize);
+
+    for (i = 0; i < nfft/2; ++i) {
+        double phase =
+            -3.14159265358979323846264338327 * ((double) (i+1) / nfft + .5);
+        if (inverse_fft)
+            phase *= -1;
+        kf_cexp (st->super_twiddles+i,phase);
+    }
+    return st;
+}
+
+void kiss_fftr(kiss_fftr_cfg st,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata)
+{
+    /* input buffer timedata is stored row-wise */
+    int k,ncfft;
+    kiss_fft_cpx fpnk,fpk,f1k,f2k,tw,tdc;
+
+    if ( st->substate->inverse) {
+        fprintf(stderr,"kiss fft usage error: improper alloc\n");
+        exit(1);
+    }
+
+    ncfft = st->substate->nfft;
+
+    /*perform the parallel fft of two real signals packed in real,imag*/
+    kiss_fft( st->substate , (const kiss_fft_cpx*)timedata, st->tmpbuf );
+    /* The real part of the DC element of the frequency spectrum in st->tmpbuf
+     * contains the sum of the even-numbered elements of the input time sequence
+     * The imag part is the sum of the odd-numbered elements
+     *
+     * The sum of tdc.r and tdc.i is the sum of the input time sequence. 
+     *      yielding DC of input time sequence
+     * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1... 
+     *      yielding Nyquist bin of input time sequence
+     */
+ 
+    tdc.r = st->tmpbuf[0].r;
+    tdc.i = st->tmpbuf[0].i;
+    C_FIXDIV(tdc,2);
+    CHECK_OVERFLOW_OP(tdc.r ,+, tdc.i);
+    CHECK_OVERFLOW_OP(tdc.r ,-, tdc.i);
+    freqdata[0].r = tdc.r + tdc.i;
+    freqdata[ncfft].r = tdc.r - tdc.i;
+#ifdef USE_SIMD    
+    freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps(0);
+#else
+    freqdata[ncfft].i = freqdata[0].i = 0;
+#endif
+
+    for ( k=1;k <= ncfft/2 ; ++k ) {
+        fpk    = st->tmpbuf[k]; 
+        fpnk.r =   st->tmpbuf[ncfft-k].r;
+        fpnk.i = - st->tmpbuf[ncfft-k].i;
+        C_FIXDIV(fpk,2);
+        C_FIXDIV(fpnk,2);
+
+        C_ADD( f1k, fpk , fpnk );
+        C_SUB( f2k, fpk , fpnk );
+        C_MUL( tw , f2k , st->super_twiddles[k-1]);
+
+        freqdata[k].r = HALF_OF(f1k.r + tw.r);
+        freqdata[k].i = HALF_OF(f1k.i + tw.i);
+        freqdata[ncfft-k].r = HALF_OF(f1k.r - tw.r);
+        freqdata[ncfft-k].i = HALF_OF(tw.i - f1k.i);
+    }
+}
+
+void kiss_fftri(kiss_fftr_cfg st,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata)
+{
+    /* input buffer timedata is stored row-wise */
+    int k, ncfft;
+
+    if (st->substate->inverse == 0) {
+        fprintf (stderr, "kiss fft usage error: improper alloc\n");
+        exit (1);
+    }
+
+    ncfft = st->substate->nfft;
+
+    st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r;
+    st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
+    C_FIXDIV(st->tmpbuf[0],2);
+
+    for (k = 1; k <= ncfft / 2; ++k) {
+        kiss_fft_cpx fk, fnkc, fek, fok, tmp;
+        fk = freqdata[k];
+        fnkc.r = freqdata[ncfft - k].r;
+        fnkc.i = -freqdata[ncfft - k].i;
+        C_FIXDIV( fk , 2 );
+        C_FIXDIV( fnkc , 2 );
+
+        C_ADD (fek, fk, fnkc);
+        C_SUB (tmp, fk, fnkc);
+        C_MUL (fok, tmp, st->super_twiddles[k-1]);
+        C_ADD (st->tmpbuf[k],     fek, fok);
+        C_SUB (st->tmpbuf[ncfft - k], fek, fok);
+#ifdef USE_SIMD        
+        st->tmpbuf[ncfft - k].i *= _mm_set1_ps(-1.0);
+#else
+        st->tmpbuf[ncfft - k].i *= -1;
+#endif
+    }
+    kiss_fft (st->substate, st->tmpbuf, (kiss_fft_cpx *) timedata);
+}
diff --git a/codec2-dev/src/kiss_fftr.h b/codec2-dev/src/kiss_fftr.h
new file mode 100644
index 00000000..72e5a577
--- /dev/null
+++ b/codec2-dev/src/kiss_fftr.h
@@ -0,0 +1,46 @@
+#ifndef KISS_FTR_H
+#define KISS_FTR_H
+
+#include "kiss_fft.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+    
+/* 
+ 
+ Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
+
+ 
+ 
+ */
+
+typedef struct kiss_fftr_state *kiss_fftr_cfg;
+
+
+kiss_fftr_cfg kiss_fftr_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
+/*
+ nfft must be even
+
+ If you don't care to allocate space, use mem = lenmem = NULL 
+*/
+
+
+void kiss_fftr(kiss_fftr_cfg cfg,const kiss_fft_scalar *timedata,kiss_fft_cpx *freqdata);
+/*
+ input timedata has nfft scalar points
+ output freqdata has nfft/2+1 complex points
+*/
+
+void kiss_fftri(kiss_fftr_cfg cfg,const kiss_fft_cpx *freqdata,kiss_fft_scalar *timedata);
+/*
+ input freqdata has  nfft/2+1 complex points
+ output timedata has nfft scalar points
+*/
+
+#define kiss_fftr_free free
+
+#ifdef __cplusplus
+}
+#endif
+#endif