Let’s start with his voice! (Part 5 – Expanding HERO’s vocabulary)

The next logical step for me after getting HERO to say the words associated with the byte codes I send from the Raspberry Pi master is to use the processing and storage of the Raspberry Pi to create a dictionary. Here are the goals:

  1. Create a database which is a dictionary of words to phonemes
  2. Assign byte codes to the phonemes
  3. Create a C++ library class that can:
    1. Take a phrase
    2. Break it down to words
    3. Locate the words in the database dictionary
    4. Map it to phonemes
    5. Send the phonemes via I2C from the Raspberry Pi to the Arduino

As the Arduino can already handle the I2C slave communication and sending of bytes to HERO’s speech board based on a previous blog post, everything in this project revolves around the dictionary and library creation.

Step 1: Create a database which is a dictionary of words to phonemes

The first thought that went through my head on solving this piece of the puzzle was that every dictionary has a breakdown of words into phonemes.  These phonemes are all defined as well by the International Phonetic Alphabet.  If I could find an online word list, or CSV dictionary of some form, that gives each word in the dictionary and the phonemes, I’d be well on my way.  This, of course, would leave out slang and proper names… but I figured one problem at a time.  While looking for this list, I stumbled on something MUCH better!  Apparently, Carnegie Mellon University publishes a machine-readable “Pronouncing Dictionary.”  I then installed SQLite on the Raspberry Pi, downloaded the cmudict-0.7b* files from Carnegie Mellon University, and threw together the following C program to load the dictionary:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sqlite3.h>

// Path of the SQLite database file that main() opens and hands to the loaders.
char* dbname="../dictionary.db";

// Creates the CMU_Phonemes table and fills it from the cmudict-0.7b.symbols file.
void createPhonemeList(sqlite3* db);
// Creates the CMU_Words and CMU_WXP tables and fills them from the cmudict-0.7b file.
void createWordList(sqlite3* db);
// Removes the trailing newline character from line, in place.
void remove_newline_ch(char *line);
// Returns a newly allocated copy of orig with every rep replaced by with;
// the caller frees the result.
char *str_replace(char *orig, char *rep, char *with);

// Entry point of the dictionary loader: opens the SQLite database named by
// dbname, then loads the phoneme list and the word list into it.
//
// Returns 0 after both loaders have run, or 1 if the database could not be
// opened.
int main()
{
  sqlite3* db;
  int result=sqlite3_open(dbname,&db) ;
  if (result != SQLITE_OK)
  {
    printf("Failed to open database \n\r");
    // A handle may be returned even when opening fails; release it.
    sqlite3_close(db) ;
    return 1;
  }
  printf("Opened db %s OK\n\r",dbname) ;

  createPhonemeList(db);
  createWordList(db);

  // Close the connection explicitly instead of leaking the handle at exit.
  sqlite3_close(db);
  return 0;
}

// Returns a newly allocated copy of orig in which every non-overlapping
// occurrence of rep has been replaced by with.
//
//   orig - string to search; when NULL the function returns NULL
//   rep  - substring to replace; NULL or "" means "nothing to replace" and
//          yields an unmodified copy of orig
//   with - replacement text; NULL is treated as ""
//
// Returns NULL when orig is NULL or the allocation fails.
// You must free the result if result is non-NULL.
char *str_replace(char *orig, char *rep, char *with)
{
  char *result;     // the return string
  char *out;        // next write position inside result
  char *scan;       // unprocessed remainder of orig
  char *hit;        // current occurrence of rep inside scan
  size_t len_orig;
  size_t len_rep;
  size_t len_with;
  size_t count = 0; // number of replacements

  if (!orig)
    return NULL;
  if (!with)
    with = "";

  len_orig = strlen(orig);
  len_rep = rep ? strlen(rep) : 0;
  len_with = strlen(with);

  // An empty pattern matches at every position without consuming input, so
  // the counting loop below would never terminate; hand back a plain copy.
  if (len_rep == 0) {
    result = malloc(len_orig + 1);
    if (result)
      memcpy(result, orig, len_orig + 1);
    return result;
  }

  // First pass: count the occurrences so the result can be sized exactly.
  for (scan = orig; (hit = strstr(scan, rep)) != NULL; scan = hit + len_rep)
    ++count;

  // Occurrences do not overlap, so count * len_rep never exceeds len_orig.
  result = malloc(len_orig - count * len_rep + count * len_with + 1);
  if (!result)
    return NULL;

  // Second pass: copy the text in front of each occurrence, then the
  // replacement, and skip over the occurrence itself.
  out = result;
  scan = orig;
  while ((hit = strstr(scan, rep)) != NULL) {
    size_t len_front = (size_t)(hit - scan);
    memcpy(out, scan, len_front);
    out += len_front;
    memcpy(out, with, len_with);
    out += len_with;
    scan = hit + len_rep;
  }
  strcpy(out, scan); // tail after the last occurrence, including the NUL
  return result;
}

// Removes a single trailing newline ('\n') from line, in place.
// Does nothing when line is NULL, empty, or does not end in a newline.
void remove_newline_ch(char *line)
{
  size_t len;

  if (!line)
    return;

  len = strlen(line);
  // Guard the empty string: len - 1 would otherwise index before the buffer.
  if (len > 0 && line[len - 1] == '\n')
    line[len - 1] = '\0';
}

// Creates the CMU_Phonemes table and inserts one row per line of the
// "cmudict-0.7b.symbols" file, which is opened relative to the current
// working directory.
//
//   db - open SQLite connection to populate
//
// SQL errors are reported on stderr and loading continues with the next
// line. If the symbols file cannot be opened, the reason is reported on
// stderr and the function returns after creating the table.
void createPhonemeList(sqlite3* db)
{
  char* create = "create table CMU_Phonemes(phoneme varchar(5));";
  char *zErrMsg = 0;
  char line[80];
  // Large enough for the fixed statement text plus a full-length line.
  char insert[sizeof line + 64];
  int rc;
  FILE *fp;

  rc = sqlite3_exec(db, create, NULL, 0, &zErrMsg);
  if( rc!=SQLITE_OK ){
    fprintf(stderr, "SQL error: %s\n", zErrMsg);
    sqlite3_free(zErrMsg);
  }

  fp = fopen("cmudict-0.7b.symbols","r");
  if (fp == NULL)
  {
    perror("cmudict-0.7b.symbols");
    return;
  }

  while(fgets(line, sizeof line, fp) != NULL)
  {
    remove_newline_ch(line);
    // snprintf bounds the write so an unexpectedly long line cannot overrun insert.
    snprintf(insert, sizeof insert, "insert into CMU_Phonemes(phoneme) values ('%s');", line);
    printf("%s\n", insert);
    rc = sqlite3_exec(db, insert, NULL, 0, &zErrMsg);
    if( rc!=SQLITE_OK ){
      fprintf(stderr, "SQL error: %s\n", zErrMsg);
      sqlite3_free(zErrMsg);
    }
  }
  fclose(fp);
}

// Creates the CMU_Words and CMU_WXP tables and fills them from the
// "cmudict-0.7b" file, which is opened relative to the current working
// directory.
//
// Each non-comment line is split on spaces: the first token is stored as the
// word in CMU_Words, and every following token is stored as one phoneme row
// in CMU_WXP together with its zero-based position (sort). Lines starting
// with ";;;" are skipped. wordid advances once per non-comment line.
//
//   db - open SQLite connection to populate
//
// SQL errors are reported on stderr and loading continues. If the dictionary
// file cannot be opened, the reason is reported on stderr and the function
// returns after creating the tables.
void createWordList(sqlite3* db)
{
  char* createW = "create table CMU_Words(wordid int, word varchar(255));";
  char* createX = "create table CMU_WXP(wordid int, phoneme varchar(5), sort int);";
  char *zErrMsg = 0;
  char *es_line;
  char *tok;
  char line[255];
  // Worst case: every character of the word is a quote that gets doubled,
  // plus the fixed statement text and the numeric fields.
  char insert[2 * sizeof line + 96];
  int rc;
  int sort;
  int wordid = 0;
  FILE *fp;

  rc = sqlite3_exec(db, createW, NULL, 0, &zErrMsg);
  if( rc!=SQLITE_OK ){
    fprintf(stderr, "SQL error: %s\n", zErrMsg);
    sqlite3_free(zErrMsg);
  }

  rc = sqlite3_exec(db, createX, NULL, 0, &zErrMsg);
  if( rc!=SQLITE_OK ){
    fprintf(stderr, "SQL error: %s\n", zErrMsg);
    sqlite3_free(zErrMsg);
  }

  fp = fopen("cmudict-0.7b","r");
  if (fp == NULL)
  {
    perror("cmudict-0.7b");
    return;
  }

  while(fgets(line, sizeof line, fp) != NULL)
  {
    // Skip comment lines.
    if(strncmp(line, ";;;", 3) == 0)
      continue;

    remove_newline_ch(line);
    tok = strtok(line, " ");

    // Double any single quote so the word is a valid SQL string literal.
    // str_replace returns NULL for a blank line (tok == NULL) or when out of
    // memory; such a line stores nothing but still consumes a wordid.
    es_line = str_replace(tok, "'", "''");
    if(es_line)
    {
      snprintf(insert, sizeof insert, "insert into CMU_Words(wordid, word) values (%d, '%s');", wordid, es_line);
      printf("%s\n", insert);
      rc = sqlite3_exec(db, insert, NULL, 0, &zErrMsg);
      if( rc!=SQLITE_OK ){
        fprintf(stderr, "SQL error: %s\n", zErrMsg);
        sqlite3_free(zErrMsg);
      }
      free(es_line);

      // The remaining tokens on the line are the phonemes, in spoken order.
      sort = 0;
      while((tok = strtok(NULL, " ")) != NULL)
      {
        snprintf(insert, sizeof insert, "insert into CMU_WXP(wordid, phoneme, sort) values (%d, '%s', %d);", wordid, tok, sort++);
        printf("%s\n", insert);
        rc = sqlite3_exec(db, insert, NULL, 0, &zErrMsg);
        if( rc!=SQLITE_OK ){
          fprintf(stderr, "SQL error: %s\n", zErrMsg);
          sqlite3_free(zErrMsg);
        }
      }
    }
    wordid++;
  }
  fclose(fp);
}

Step 2: Assign byte codes to phonemes

I wish this part was more straightforward than it was.  As there are fewer SC-01-A phonemes than CMU phonemes, it was a grueling manual process of going through each of the phonemes from the list of CMU phonemes, and trying to determine which SC-01-A phoneme it best matched.  I then created a mapping table of CMU phonemes to SC-01-A phonemes.  This allows me to take a word from the CMU list, get the phonemes, and look up the corresponding SC-01-A phoneme.

Step 3: Build a C++ speech library class

Here’s what I came up with:

#include <cstddef>
#include <iostream>
#include <iomanip>
#include <string>
#include <sstream>
#include <iterator>
#include <vector>
#include <sqlite3.h>
#include <string.h>
#include <stdlib.h>
#include <unistd.h>
#include <linux/i2c.h>
#include <linux/i2c-dev.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include "speech.h"

using namespace std;
// Opens the pronunciation database read-only and prepares the I2C bus for
// talking to the speech board.
//
//   db - path of the SQLite dictionary database
//
// Throws the SQLite result code (an int) when the database cannot be opened.
// The results of opening and configuring /dev/i2c-1 are only printed in
// DEBUG builds; they are not otherwise checked.
Speech::Speech(const char* db)
{
#ifdef DEBUG
  cout << "Speech constructor called" << endl;
#endif

  _db = NULL;
  const int result = sqlite3_open_v2(db, &_db, SQLITE_OPEN_READONLY, NULL);
#ifdef DEBUG
  cout << "db open result: " << result << endl;
#endif
  if (SQLITE_OK != result)
  {
    // Release whatever handle sqlite handed back before reporting the failure.
    sqlite3_close(_db);
    throw result;
  }

  // File descriptor for the I2C bus.
  _i2cHandle = open("/dev/i2c-1", O_RDWR);

  // The speech board is addressed with a regular (non 10-bit) address.
  const int tenBitAddress = 0;
  int opResult = ioctl(_i2cHandle, I2C_TENBIT, tenBitAddress);
#ifdef DEBUG
  cout << "Set I2C to non 10-bit mode result: " << opResult << endl;
#endif

  // Select the speech board as the slave for subsequent reads and writes.
  opResult = ioctl(_i2cHandle, I2C_SLAVE, I2C_SPEECH);
#ifdef DEBUG
  cout << "Set I2C speech board address result: " << opResult << endl;
#endif
}

// Releases what the constructor acquired: the database connection (when one
// is held) and the I2C file descriptor.
Speech::~Speech()
{
#ifdef DEBUG
  cout << "Speech deconstructor called" << endl;
#endif

  if (_db)
  {
#ifdef DEBUG
    cout << "Freeing database" << endl;
#endif
    sqlite3_close(_db);
  }

#ifdef DEBUG
  cout << "Freeing I2C handle" << endl;
#endif
  close(_i2cHandle);
}

  // Looks up word in the dictionary database and returns the speech-board
  // byte codes for its phonemes, ordered by phoneme position within the word
  // (x.sort) and then by position within the CMU-to-SC01 mapping (cs.sort).
  //
  //   word - NUL-terminated word to look up; it is compared with SQL equality
  //          against CMU_Words.word
  //
  // Returns an empty vector when no database is open, word is NULL, the
  // statement cannot be prepared or bound, or the query yields no rows.
  vector<unsigned char> Speech::wordToPhonemeBytes(const char* word)
  {
    vector<unsigned char> bytes;
    if(!_db || !word) return bytes;

    sqlite3_stmt *stmt = NULL;
    const char* sql = "select sp.code from CMU_Words w "
    "inner join CMU_WXP x on w.wordid= x.wordid "
    "inner join CMU_SC01 cs on x.phoneme = cs.CMU_PH "
    "inner join SC01_Phonemes sp on sp.phoneme = cs.SC01_PH "
    "where word = ? "
    "order by x.sort, cs.sort";

    // sqlite3_prepare_v2 is the recommended replacement for the legacy
    // sqlite3_prepare; a length of -1 reads the SQL up to its NUL terminator.
    int rc = sqlite3_prepare_v2(_db, sql, -1, &stmt, NULL);
    if( rc != SQLITE_OK ) return bytes;

    // SQLITE_STATIC: word outlives the statement, so no copy is needed.
    rc = sqlite3_bind_text(stmt, 1, word, -1, SQLITE_STATIC);
    if( rc == SQLITE_OK )
    {
      while(sqlite3_step(stmt) == SQLITE_ROW)
        bytes.push_back(static_cast<unsigned char>(sqlite3_column_int(stmt, 0)));
    }

    sqlite3_finalize(stmt);
    return bytes;
  }

// Converts every character of s to upper case, in place.
void Speech::toUpper(string& s) {
  for (string::iterator p = s.begin(); p != s.end(); ++p) {
    // toupper requires a value representable as unsigned char (or EOF);
    // passing a negative plain char (any non-ASCII byte where char is signed)
    // is undefined, so convert first.
    *p = static_cast<char>(toupper(static_cast<unsigned char>(*p)));
  }
}

// Writes bytes to the I2C device one byte at a time, pausing 1 ms after each
// byte, then reads up to 32 bytes back from the device.
//
//   bytes - byte codes to transmit, in order
//
// Returns true when every byte was written, false when at least one write
// failed. Transmission continues after a failed write. The data read back is
// not inspected.
bool Speech::send(vector<unsigned char> bytes)
{
  bool allWritten = true;
  char rxBuffer[32];    //    receive buffer

  for (vector<unsigned char>::const_iterator byte = bytes.begin(); byte != bytes.end(); ++byte)
  {
    const unsigned char c = *byte;
    if(write(_i2cHandle, &c, 1) != 1)
    {
      // Remember the failure so the caller is no longer told everything went fine.
      allWritten = false;
      #ifdef DEBUG
      cout << "No ACK bit!\n";
      #endif
    }
    usleep(1000); //sleep for 1 millisecond
  }

  // Read the device's reply; its content is currently unused.
  ssize_t received = read(_i2cHandle, rxBuffer, sizeof rxBuffer);
  (void)received;

  return allWritten;
}

// Speaks phrase: splits it on whitespace, upper-cases each word, looks up the
// byte codes for each word with wordToPhonemeBytes, and sends the assembled
// byte sequence with send().
//
// A word with no byte codes is replaced by ten 0x00 bytes. Every word is
// followed by a 0x03 pause, and the sequence ends with 0x3F (STOP) and 0xFF
// (tell the Arduino to send).
//
//   phrase - NUL-terminated text to speak
//
// Returns false when phrase is NULL, otherwise the result of send().
bool Speech::Say(const char* phrase)
{
  // Constructing a std::string from a NULL pointer is undefined.
  if(phrase == NULL)
    return false;

  string words(phrase);
  istringstream buf(words);
  istream_iterator<string> beg(buf), end;
  vector<unsigned char> v_phrase, v_word;

  // Whitespace-separated tokens of the phrase.
  vector<string> tokens(beg, end);

  for (vector<string>::iterator word = tokens.begin(); word != tokens.end(); ++word)
  {
    toUpper(*word);
    #ifdef DEBUG
    cout << "Getting bytes for: \"" << *word << "\": " << endl;
    #endif

    v_word = wordToPhonemeBytes((*word).c_str());
    if(v_word.empty())
    {
      #ifdef DEBUG
      cout << "\tWord not found in dicitonary!" << endl;
      #endif
      for(int hold = 0; hold < 10; hold++)
        v_word.push_back(0x00);//make prolonged EH sound
    }
    else
    {
      #ifdef DEBUG
      for (vector<unsigned char>::iterator byte = v_word.begin(); byte != v_word.end(); ++byte)
        cout << "\t" << hex << (int)*byte << endl;
      #endif
    }
    v_phrase.insert(v_phrase.end(), v_word.begin(), v_word.end());
    v_phrase.push_back(0x03); //pause between words
  }
  v_phrase.push_back(0x3F); // STOP
  v_phrase.push_back(0xFF); // Tell Arduino to send

  #ifdef DEBUG
  cout << "sending:";
  for (vector<unsigned char>::iterator byte = v_phrase.begin(); byte != v_phrase.end(); ++byte)
    cout << " " << hex << (int)*byte;
  cout << endl;
  #endif

  // The function is declared bool; flowing off the end without a value is
  // undefined behaviour, so report how the transmission went.
  return send(v_phrase);
}

From this point, putting all of the pieces together with a quick test application was pretty straightforward:

#include <iostream>
#include "speech.h"

using namespace std;
// Interactive test driver: creates a Speech object on "dictionary.db", then
// speaks every line typed on standard input until the user enters "q" or
// input ends.
//
// Returns 0 on normal exit, -1 when the Speech object could not be created.
int main(int argc, char **argv)
{
  Speech* sp;
  try
  {
    sp = new Speech("dictionary.db");
  }
  catch(int ex)
  {
    cout << "Speech library threw exception: " << ex << endl;
    return -1;
  }

  string phrase;
  while(true)
  {
    cout << "Enter text: ";
    // Stop on end-of-file or a read error as well as on "q"; without the
    // stream check a closed stdin would make this loop spin forever.
    if(!getline(cin, phrase) || phrase == "q")
      break;
    sp->Say(phrase.c_str());
  }
  delete sp;
  return 0;
}

At this point, I have two outstanding issues for my speech interface:

  1. Minify the circuit
  2. Determine the actual byte codes for HERO’s old ROM based speeches described in Part 4, and recreate these as “EEPROM” speeches on the speech board controller (read Arduino).

We’re now caught up to the actual project I’m working on, so blog posts will begin to slow down as they will be more tied to current work in progress than explaining how I got to this stage.  For now, good night!

Leave a Reply

Your email address will not be published. Required fields are marked *