Navit SVN

/work/compile/navit/src/navit/support/espeak/synthesize.h

00001 /***************************************************************************
00002  *   Copyright (C) 2005 to 2007 by Jonathan Duddington                     *
00003  *   email: jonsd@users.sourceforge.net                                    *
00004  *                                                                         *
00005  *   This program is free software; you can redistribute it and/or modify  *
00006  *   it under the terms of the GNU General Public License as published by  *
00007  *   the Free Software Foundation; either version 3 of the License, or     *
00008  *   (at your option) any later version.                                   *
00009  *                                                                         *
00010  *   This program is distributed in the hope that it will be useful,       *
00011  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00012  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
00013  *   GNU General Public License for more details.                          *
00014  *                                                                         *
00015  *   You should have received a copy of the GNU General Public License     *
00016  *   along with this program; if not, see:                                 *
00017  *               <http://www.gnu.org/licenses/>.                           *
00018  ***************************************************************************/
00019 
00020 // NOTE(review): no include guard and no #include lines are visible in this header; it uses types declared elsewhere (PHONEME_TAB, DOUBLEX, FILE, espeak_EVENT, ...) so those must already be in scope
00021 #define N_PHONEME_LIST  1000    // size of phoneme_list[]; enough for source[N_TR_SOURCE] full of text, else it will truncate
00022 
00023 #define MAX_HARMONIC  400           // 400 * 50Hz = 20 kHz, more than enough
00024 #define N_SEQ_FRAMES   25           // max frames in a spectrum sequence (real max is about 8); sizes SPECT_SEQ.frame[] and SPECT_SEQK.frame[]
00025 #define STEPSIZE  64                // samples per step: 2.9 ms at 22 kHz sample rate
00026 
00027 #define    PITCHfall   0            // NOTE(review): presumably selects a falling pitch contour - confirm at use sites
00028 #define    PITCHrise   1            // NOTE(review): presumably selects a rising pitch contour - confirm at use sites
00029 
00030 // flags set for frames within a spectrum sequence (bit masks; the frame structs in this header have a short frflags field)
00031 #define FRFLAG_KLATT           0x01   // this frame includes extra data for Klatt synthesizer
00032 #define FRFLAG_VOWEL_CENTRE    0x02   // centre point of vowel
00033 #define FRFLAG_LEN_MOD         0x04   // reduce effect of length adjustment
00034 #define FRFLAG_BREAK_LF        0x08   // but keep f3 upwards  (NOTE(review): reads as a qualifier of FRFLAG_BREAK, i.e. "don't merge with next frame, but keep f3 upwards" - confirm)
00035 #define FRFLAG_BREAK           0x10   // don't merge with next frame
00036 #define FRFLAG_BREAK_2         0x18   // mask of both break bits: FRFLAG_BREAK_LF | FRFLAG_BREAK (0x08 | 0x10)
00037 #define FRFLAG_FORMANT_RATE    0x20   // Flag5 allow increased rate of change of formant freq
00038 #define FRFLAG_MODULATE        0x40   // Flag6 modulate amplitude of some cycles to give trill
00039 #define FRFLAG_DEFER_WAV       0x80   // Flag7 defer mixing WAV until the next frame
00040 #define FRFLAG_COPIED        0x8000   // This frame has been copied into temporary rw memory (bit 15)
00041 // per-phoneme flag bits (NOTE(review): presumably stored in PHONEME_LIST.synthflags - confirm at use sites)
00042 #define SFLAG_SEQCONTINUE      0x01   // a liquid or nasal after a vowel, but not followed by a vowel
00043 #define SFLAG_EMBEDDED         0x02   // there are embedded commands before this phoneme
00044 #define SFLAG_SYLLABLE         0x04   // vowel or syllabic consonant
00045 #define SFLAG_LENGTHEN         0x08   // lengthen symbol : included after this phoneme
00046 #define SFLAG_DICTIONARY       0x10   // the pronunciation of this word was listed in the xx_list dictionary
00047 #define SFLAG_SWITCHED_LANG    0x20   // this word uses phonemes from a different language
00048 #define SFLAG_PROMOTE_STRESS   0x40   // this unstressed word can be promoted to stressed
00049 
00050 // embedded command numbers (1-13; 0 is not assigned here)
00051 #define EMBED_P     1   // pitch
00052 #define EMBED_S     2   // speed (used in setlengths)
00053 #define EMBED_A     3   // amplitude/volume
00054 #define EMBED_R     4   // pitch range/expression
00055 #define EMBED_H     5   // echo/reverberation
00056 #define EMBED_T     6   // different tone for announcing punctuation
00057 #define EMBED_I     7   // sound icon
00058 #define EMBED_S2    8   // speed (used in synthesize)
00059 #define EMBED_Y     9   // say-as commands
00060 #define EMBED_M    10   // mark name
00061 #define EMBED_U    11   // audio uri
00062 #define EMBED_B    12   // break
00063 #define EMBED_F    13   // emphasis
00064 
00065 #define N_EMBEDDED_VALUES    14   // highest command number (EMBED_F = 13) + 1
00066 extern int embedded_value[N_EMBEDDED_VALUES];     // NOTE(review): presumably the current value per command, indexed by EMBED_* - confirm
00067 extern int embedded_default[N_EMBEDDED_VALUES];   // NOTE(review): presumably the default value per command, indexed by EMBED_* - confirm
00068 
00069 
00070 #define N_PEAKS   9     // NOTE(review): presumably the number of formant peaks handled by wavegen - confirm
00071 #define N_MARKERS 8     // NOTE(review): usage is not visible in this header
00072 
00073 #define N_KLATTP   10   // count of the KLATT_* indices below (0-9); this affects the phoneme data file format
00074 #define N_KLATTP2  14   // used in vowel files, with extra parameters for future extensions
00075 
00076 #define KLATT_AV      0    // indices 0-4 follow the order documented on frame_t.klattp[]: AV, FNZ, Tilt, Aspr, Skew
00077 #define KLATT_FNZ     1    // nasal zero freq
00078 #define KLATT_Tilt    2
00079 #define KLATT_Aspr    3
00080 #define KLATT_Skew    4
00081 
00082 #define KLATT_Kopen   5    // indices 5-9 continue past klattp[5]; frame_t.klattp2[] is documented as the continuation of klattp[]
00083 #define KLATT_AVp     6
00084 #define KLATT_Fric    7
00085 #define KLATT_FricBP  8
00086 #define KLATT_Turb    9
00087 
00088 
00089 
00090 typedef struct {  // 64 bytes on typical ABIs: the members total 63 bytes, plus 1 byte of tail padding for the 2-byte alignment of short
00091         short frflags;                    // FRFLAG_* bits
00092         short ffreq[7];                   // NOTE(review): presumably formant frequencies f0-6 - confirm units
00093         unsigned char length;
00094         unsigned char rms;
00095         unsigned char fheight[8];         // NOTE(review): presumably formant heights f0-7 - confirm
00096         unsigned char fwidth[6];          // width/4  f0-5
00097         unsigned char fright[3];          // width/4  f0-2
00098         unsigned char bw[4];        // Klatt bandwidth BNZ /2, f1,f2,f3
00099         unsigned char klattp[5];    // AV, FNZ, Tilt, Aspr, Skew  (KLATT_* indices 0-4)
00100         unsigned char klattp2[5];   // continuation of klattp[]:  Kopen, AVp, Fric, FricBP, Turb  (KLATT_* indices 5-9)
00101         unsigned char klatt_ap[7];  // Klatt parallel amplitude
00102         unsigned char klatt_bp[7];  // Klatt parallel bandwidth  /2
00103 } frame_t;   //  with extra Klatt parameters for parallel resonators; element type of SPECT_SEQK.frame[]
00104 
00105 typedef struct {  // 44 bytes; same members, in the same order, as the start of frame_t (up to and including klattp[])
00106         short frflags;                    // FRFLAG_* bits
00107         short ffreq[7];
00108         unsigned char length;
00109         unsigned char rms;
00110         unsigned char fheight[8];
00111         unsigned char fwidth[6];          // width/4  f0-5
00112         unsigned char fright[3];          // width/4  f0-2
00113         unsigned char bw[4];        // Klatt bandwidth BNZ /2, f1,f2,f3
00114         unsigned char klattp[5];    // AV, FNZ, Tilt, Aspr, Skew  (KLATT_* indices 0-4)
00115 } frame_t2;   //  TESTING; shorter frame without klattp2/klatt_ap/klatt_bp, element type of SPECT_SEQ.frame[]
00116 
00117 
00118 #ifdef deleted  // compiled only if the macro "deleted" is defined; the two structs below reuse the names frame_t / frame_t2 of the live definitions above
00119 typedef struct {
00120         short frflags;
00121         unsigned char length;
00122         unsigned char rms;
00123         short ffreq[9];
00124         unsigned char fheight[9];
00125         unsigned char fwidth[6];          // width/4
00126         unsigned char fright[6];          // width/4
00127         unsigned char fwidth6, fright6;
00128         unsigned char klattp[N_KLATTP];
00129 } frame_t;
00130 
00131 typedef struct {  // 43 bytes of members (sizeof is typically 44 after padding for the 2-byte short alignment)
00132         short frflags;
00133         unsigned char length;
00134         unsigned char rms;
00135         short ffreq[9];
00136         unsigned char fheight[9];
00137         unsigned char fwidth[6];          // width/4
00138         unsigned char fright[6];          // width/4
00139 } frame_t2;   //  the original, without Klatt additions, used for file "phondata" 
00140 #endif
00141 
00142 
00143 
00144 // formant data used by wavegen: one peak, held as fixed-point ints plus floating point copies and per-step increments
00145 typedef struct {
00146         int freq;     // Hz<<16
00147         int height;   // height<<15
00148         int left;     // Hz<<16
00149         int right;    // Hz<<16
00150         DOUBLEX freq1; // floating point versions of the above (DOUBLEX is defined outside this header)
00151         DOUBLEX height1;
00152         DOUBLEX left1;
00153         DOUBLEX right1;
00154         DOUBLEX freq_inc;    // increment by this every 64 samples
00155         DOUBLEX height_inc;  // the *_inc members: NOTE(review) presumably applied on the same 64-sample step as freq_inc - confirm
00156         DOUBLEX left_inc;
00157         DOUBLEX right_inc;
00158 }  wavegen_peaks_t;
00159 
00160 typedef struct {  // pitch, mixed-in wave file and amplitude settings (NOTE(review): presumably the working state of the wave generator - confirm)
00161 unsigned char *pitch_env;  // pitch envelope data that pitch_ix indexes
00162 int pitch;          // pitch Hz*256
00163 int pitch_ix;       // index into pitch envelope (*256)
00164 int pitch_inc;      // increment to pitch_ix
00165 int pitch_base;     // Hz*256 low, before modified by envelope
00166 int pitch_range;    // Hz*256 range of envelope
00167 
00168 unsigned char *mix_wavefile;  // wave file to be added to synthesis
00169 int n_mix_wavefile;       // length in bytes
00170 int mix_wave_scale;         // 0=2 byte samples
00171 int mix_wave_amp;
00172 int mix_wavefile_ix;        // NOTE(review): presumably the current position within mix_wavefile - confirm
00173 
00174 int amplitude;
00175 int amplitude_v;
00176 int prev_was_synth;  // previous sound was synthesized (not a played wave or pause)
00177 } WGEN_DATA;
00178 
00179 
00180 typedef struct {  // five doubles; NOTE(review): presumably the state of one second-order digital resonator - confirm in the Klatt/wavegen sources
00181         double a;    // a, b, c: NOTE(review) presumably the filter coefficients - confirm
00182         double b;
00183         double c;
00184         double x1;   // x1, x2: NOTE(review) presumably the two previous values kept between samples - confirm
00185         double x2;
00186 }  RESONATOR;
00187 
00188 
00189 typedef struct {  // header (4 bytes of members) followed by a fixed array of N_SEQ_FRAMES short frames (frame_t2)
00190    short length_total;  // not used
00191    unsigned char  n_frames;   // NOTE(review): presumably the number of entries of frame[] in use - confirm
00192    unsigned char  flags;
00193    frame_t2  frame[N_SEQ_FRAMES];     // max. frames in a spectrum sequence
00194 } SPECT_SEQ;   // sequence of espeak formant frames
00195 
00196 typedef struct {  // same header members as SPECT_SEQ, but the frames are the longer frame_t with the extra Klatt parameters
00197    short length_total;  // not used
00198    unsigned char  n_frames;   // NOTE(review): presumably the number of entries of frame[] in use - confirm
00199    unsigned char  flags;
00200    frame_t  frame[N_SEQ_FRAMES];     // max. frames in a spectrum sequence
00201 } SPECT_SEQK;   // sequence of klatt formants frames
00202 
00203 
00204 typedef struct {  // reference to a frame; LookupSpect() returns a pointer to this type
00205         short length;
00206         short frflags;     // NOTE(review): presumably FRFLAG_* bits for the referenced frame - confirm
00207         frame_t *frame;    // pointer to the frame data (not a copy held in this struct)
00208 } frameref_t;
00209 
00210 
00211 typedef struct {  // one entry of phoneme_list[], the list of phonemes in a clause
00212         PHONEME_TAB *ph;      // the phoneme's table entry (PHONEME_TAB is defined outside this header)
00213         unsigned char env;    // pitch envelope number
00214         unsigned char stresslevel;
00215         unsigned char type;
00216         unsigned char prepause;
00217         unsigned char amp;
00218         unsigned char tone_ph;   // tone phoneme to use with this vowel
00219         unsigned char newword;   // bit 0=start of word, bit 1=end of clause, bit 2=start of sentence
00220         unsigned char synthflags;  // NOTE(review): presumably SFLAG_* bits - confirm
00221         short length;  // length_mod
00222         short pitch1;  // pitch, 0-4095 within the Voice's pitch range
00223         short pitch2;
00224         unsigned short sourceix;  // ix into the original source text string, only set at the start of a word
00225 } PHONEME_LIST;
00226 
00227 
00228 typedef struct {  // one entry of soundicon_tab[]
00229         int name;
00230         int length;       // NOTE(review): presumably the size of data - confirm units
00231         char *data;       // NOTE(review): presumably the loaded audio data - confirm who allocates and frees it
00232         char *filename;
00233 } SOUND_ICON;
00234 
00235 typedef struct {  // NOTE(review): presumably one row of a phoneme translation table for mbrola voices (see LoadMbrolaTable) - confirm
00236         int  name;
00237         unsigned int  next_phoneme;
00238         int  mbr_name;
00239         int  mbr_name2;
00240         int  percent;         // percentage length of first component
00241         int  control;
00242 } MBROLA_TAB;
00243 
00244 typedef struct {  // type of the global "speed" declared later in this header
00245         int speed_factor1;
00246         int speed_factor2;
00247         int speed_factor3;
00248         int min_sample_len;
00249         int fast_settings[8];
00250 } SPEED_FACTORS;
00251 
00252 
00253 // phoneme table (array of pointers; PHONEME_TAB and N_PHONEME_TAB are defined outside this header)
00254 extern PHONEME_TAB *phoneme_tab[N_PHONEME_TAB];
00255 
00256 // list of phonemes in a clause
00257 extern int n_phoneme_list;   // NOTE(review): presumably the number of phoneme_list[] entries in use - confirm
00258 extern PHONEME_LIST phoneme_list[N_PHONEME_LIST];
00259 extern unsigned int embedded_list[];   // size is not given in this declaration
00260 
00261 extern unsigned char env_fall[128];    // 128-entry tables; NOTE(review): presumably pitch envelopes - confirm
00262 extern unsigned char env_rise[128];
00263 extern unsigned char env_frise[128];
00264 
00265 #define MAX_PITCH_VALUE  101
00266 extern unsigned char pitch_adjust_tab[MAX_PITCH_VALUE+1];   // 102 entries, so indices 0..MAX_PITCH_VALUE are valid
00267 
00268 // queue of commands for wavegen: command codes 1-12
00269 #define WCMD_KLATT      1
00270 #define WCMD_KLATT2     2
00271 #define WCMD_SPECT      3
00272 #define WCMD_SPECT2     4
00273 #define WCMD_PAUSE      5
00274 #define WCMD_WAVE    6
00275 #define WCMD_WAVE2   7
00276 #define WCMD_AMPLITUDE 8
00277 #define WCMD_PITCH      9
00278 #define WCMD_MARKER     10
00279 #define WCMD_VOICE   11
00280 #define WCMD_EMBEDDED 12
00281 
00282 
00283 #define N_WCMDQ   160   // number of entries in wcmdq[]
00284 #define MIN_WCMDQ  22   // need this many free entries before adding new phoneme
00285 
00286 extern long wcmdq[N_WCMDQ][4];   // each entry is 4 longs; NOTE(review): presumably a WCMD_* code followed by its arguments - confirm
00287 extern int wcmdq_head;           // NOTE(review): presumably head/tail indices into wcmdq[] - confirm which end is read and which is written
00288 extern int wcmdq_tail;
00289 
00290 // from Wavegen file (declarations only; the implementations are not in this header)
00291 int  WcmdqFree();          // empty parameter lists: when compiled as C these leave the parameters unspecified rather than declaring (void)
00292 void WcmdqStop();
00293 int  WcmdqUsed();
00294 void WcmdqInc();
00295 int  WavegenOpenSound();
00296 int  WavegenCloseSound();
00297 int  WavegenInitSound();
00298 void WavegenInit(int rate, int wavemult_fact);
00299 float polint(float xa[],float ya[],int n,float x);   // NOTE(review): presumably polynomial interpolation of the n points (xa, ya) at x - confirm
00300 int WavegenFill(int fill_zeros);
00301 void MarkerEvent(int type, unsigned int char_position, int value, unsigned char *out_ptr);   // the out_ptr parameter has the same name as the global out_ptr declared in this header
00302 
00303 
00304 extern unsigned char *wavefile_data;   // globals defined elsewhere; only the declarations are visible here
00305 extern int samplerate;
00306 extern int samplerate_native;
00307 
00308 extern int wavefile_ix;
00309 extern int wavefile_amp;
00310 extern int wavefile_ix2;
00311 extern int wavefile_amp2;
00312 extern int vowel_transition[4];
00313 extern int vowel_transition0, vowel_transition1;
00314 
00315 extern int mbrola_delay;
00316 extern char mbrola_name[20];   // fixed 20-byte buffer, so a name stored as a C string can be at most 19 characters
00317 
00318 // from synthdata file (declarations only; the implementations are not in this header)
00319 unsigned int LookupSound(PHONEME_TAB *ph1, PHONEME_TAB *ph2, int which, int *match_level, int control);   // match_level is a pointer: NOTE(review) presumably an output - confirm
00320 frameref_t *LookupSpect(PHONEME_TAB *ph1, PHONEME_TAB *prev_ph, PHONEME_TAB *next_ph, int which, int *match_level, int *n_frames, PHONEME_LIST *plist);   // NOTE(review): ownership/lifetime of the returned frameref_t is not visible here - confirm
00321 
00322 unsigned char *LookupEnvelope(int ix);
00323 int LoadPhData();
00324 
00325 void SynthesizeInit(void);
00326 int  Generate(PHONEME_LIST *phoneme_list, int *n_ph, int resume);   // the phoneme_list parameter has the same name as the global phoneme_list[]
00327 void MakeWave2(PHONEME_LIST *p, int n_ph);
00328 int  SynthOnTimer(void);
00329 int  SpeakNextClause(FILE *f_text, const void *text_in, int control);   // needs FILE from <stdio.h>, which this header does not include itself
00330 int  SynthStatus(void);
00331 void SetSpeed(int control);
00332 void SetEmbedded(int control, int value);
00333 void SelectPhonemeTable(int number);
00334 int  SelectPhonemeTableName(const char *name);
00335 
00336 void Write4Bytes(FILE *f, int value);   // NOTE(review): byte order written/read is not visible here - confirm
00337 int Read4Bytes(FILE *f);
00338 int CompileDictionary(const char *dsource, const char *dict_name, FILE *log, char *err_name,int flags);   // err_name is non-const: NOTE(review) presumably written by the callee - confirm required buffer size
00339 
00340 
00341 extern unsigned char *envelope_data[18];   // 18 pointers; NOTE(review): presumably one per pitch envelope number - confirm
00342 extern int formant_rate[];         // max rate of change of each formant
00343 extern SPEED_FACTORS speed;
00344 
00345 extern long count_samples;
00346 extern int outbuf_size;
00347 extern unsigned char *out_ptr;     // NOTE(review): out_ptr/out_start/out_end presumably delimit the output buffer and the current write position - confirm
00348 extern unsigned char *out_start;
00349 extern unsigned char *out_end;
00350 extern int event_list_ix;
00351 extern espeak_EVENT *event_list;              // espeak_EVENT and t_espeak_callback are declared outside this header
00352 extern t_espeak_callback* synth_callback;
00353 extern int option_log_frames;
00354 extern const char *version_string;
00355 extern const int version_phdata;
00356 
00357 #define N_SOUNDICON_TAB  80   // total entries in soundicon_tab
00358 #define N_SOUNDICON_SLOTS 4    // number of slots reserved for dynamic loading of audio files
00359 extern int n_soundicon_tab;    // NOTE(review): presumably the number of soundicon_tab[] entries in use - confirm
00360 extern SOUND_ICON soundicon_tab[N_SOUNDICON_TAB];
00361 
00362 espeak_ERROR SetVoiceByName(const char *name);   // espeak_ERROR and espeak_VOICE are declared outside this header
00363 espeak_ERROR SetVoiceByProperties(espeak_VOICE *voice_selector);
00364 espeak_ERROR LoadMbrolaTable(const char *mbrola_voice, const char *phtrans, int srate);
00365 void SetParameter(int parameter, int value, int relative);
00366 void MbrolaTranslate(PHONEME_LIST *plist, int n_phonemes, FILE *f_mbrola);
00367 //int MbrolaSynth(char *p_mbrola);   (declaration disabled in the original source)
00368 int DoSample(PHONEME_TAB *ph1, PHONEME_TAB *ph2, int which, int length_mod, int amp);
00369 int DoSpect(PHONEME_TAB *this_ph, PHONEME_TAB *prev_ph, PHONEME_TAB *next_ph,
00370                 int which, PHONEME_LIST *plist, int modulation);
00371 int PauseLength(int pause, int control);
00372 int LookupPhonemeTable(const char *name);
00373 
00374 void InitBreath(void);
00375 
00376 void KlattInit();   // empty parameter list: when compiled as C this leaves the parameters unspecified rather than declaring (void)
00377 int Wavegen_Klatt2(int length, int modulation, int resume, frame_t *fr1, frame_t *fr2);   // takes the long frame_t (with Klatt parameters), not frame_t2